From 783f433df26099ed7d0ba4178d7589bfe49df0d3 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Fri, 15 Apr 2016 00:00:02 -0700 Subject: [PATCH 01/75] feature: add initial node_value method --- sklearn/tree/_criterion.pyx | 73 +++++++++++++++++++++++++++++++++ sklearn/tree/tests/test_tree.py | 2 +- sklearn/tree/tree.py | 10 +++-- 3 files changed, 80 insertions(+), 5 deletions(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 61fde29defe8d..ecc3e2d924727 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -19,6 +19,7 @@ from libc.stdlib cimport calloc from libc.stdlib cimport free from libc.string cimport memcpy from libc.string cimport memset +from libc.math cimport fabs import numpy as np cimport numpy as np @@ -962,6 +963,78 @@ cdef class MSE(RegressionCriterion): impurity_left[0] /= self.n_outputs impurity_right[0] /= self.n_outputs +cdef class MAE(RegressionCriterion): + """Mean absolute error impurity criterion + """ + cdef void node_value(self, double* dest) nogil: + """Computes the node value of samples[start:end] into dest.""" + cdef double* sample_weight = self.sample_weight + cdef SIZE_t* samples = self.samples + + cdef DOUBLE_t* y = self.y + cdef SIZE_t start = self.start + cdef SIZE_t end = self.end + cdef SIZE_t i + cdef SIZE_t p + cdef SIZE_t k + cdef DOUBLE_t w = 1.0 + cdef DOUBLE_t y_ik + + cdef DOUBLE_t sum_weights = 0 + cdef SIZE_t median_index = 0 + cdef DOUBLE_t sum + + + cdef DOUBLE_t* y_vals + cdef DOUBLE_t* weights + for k in range(self.n_outputs): + for p in range(start,end): + i = samples[p] + + if sample_weight != NULL: + w = sample_weight[i] + + y_ik = y[i * self.y_stride + k] + y_vals[p] = y_ik + weights[p] = w + + # calculate weighted median + for p in range(start, end): + sum_weights += weights[p] + sum = sum_weights - weights[0] + + while(sum > (sum_weights/2)): + median_index +=1 + sum -= weights[median_index] + dest[k] = samples[median_index] + + cdef double node_impurity(self) nogil: + """Evaluate the impurity of the current node, i.e. the impurity of + samples[start:end]""" + # todo + pass + + cdef double proxy_impurity_improvement(self) nogil: + """Compute a proxy of the impurity reduction + This method is used to speed up the search for the best split. + It is a proxy quantity such that the split that maximizes this value + also maximizes the impurity improvement. It neglects all constant terms + of the impurity decrease for a given split. + The absolute impurity improvement is only computed by the + impurity_improvement method once the best split has been found. + """ + + # todo + pass + + cdef void children_impurity(self, double* impurity_left, + double* impurity_right) nogil: + """Evaluate the impurity in children nodes, i.e. the impurity of the + left child (samples[start:pos]) and the impurity the right child + (samples[pos:end]). 
+ """ + # todo + pass cdef class FriedmanMSE(MSE): """Mean squared error impurity criterion with improvement score by Friedman diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index e4ca2be5e452a..0053155f8622f 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -48,7 +48,7 @@ from sklearn.utils import compute_sample_weight CLF_CRITERIONS = ("gini", "entropy") -REG_CRITERIONS = ("mse", ) +REG_CRITERIONS = ("mse", "mae") CLF_TREES = { "DecisionTreeClassifier": DecisionTreeClassifier, diff --git a/sklearn/tree/tree.py b/sklearn/tree/tree.py index 81fcf7f442ac2..658b9c46b298d 100644 --- a/sklearn/tree/tree.py +++ b/sklearn/tree/tree.py @@ -56,7 +56,8 @@ DOUBLE = _tree.DOUBLE CRITERIA_CLF = {"gini": _criterion.Gini, "entropy": _criterion.Entropy} -CRITERIA_REG = {"mse": _criterion.MSE, "friedman_mse": _criterion.FriedmanMSE} +CRITERIA_REG = {"mse": _criterion.MSE, "friedman_mse": _criterion.FriedmanMSE, + "mae": _criterion_MAE} DENSE_SPLITTERS = {"best": _splitter.BestSplitter, "random": _splitter.RandomSplitter} @@ -782,9 +783,10 @@ class DecisionTreeRegressor(BaseDecisionTree, RegressorMixin): Parameters ---------- criterion : string, optional (default="mse") - The function to measure the quality of a split. The only supported - criterion is "mse" for the mean squared error, which is equal to - variance reduction as feature selection criterion. + The function to measure the quality of a split. Supported + criterions are "mse" for the mean squared error, which is + equal to variance reduction as feature selection criterion, + and "mae" for the mean absolute deviation. splitter : string, optional (default="best") The strategy used to choose the split at each node. Supported From 68ae519880b71188f8dbb788a121b6e67fabc7f0 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Sun, 17 Apr 2016 13:24:45 -0700 Subject: [PATCH 02/75] testing code for node_impurity and node_value This code runs into 'Bus Error: 10' at node_value final assignment. 
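The crash is consistent with patch 01 declaring y_vals and weights as bare pointers and then writing y_vals[p] = y_ik with no storage behind them; the hunks below attach calloc'd buffers before the writes, change the final assignment to store y_val_pointer[median_index] rather than a sample index, and fix the _criterion_MAE typo in tree.py to _criterion.MAE. As a reference while debugging, here is a minimal pure-Python sketch of the quantity node_value is meant to produce for each output, the lower weighted median (the function name and data are illustrative, not part of the patch):

    def weighted_median(values, weights):
        # smallest value at which the cumulative weight reaches half the total
        total = sum(weights)
        acc = 0.0
        for v, w in sorted(zip(values, weights)):
            acc += w
            if acc >= total / 2.0:
                return v

    print(weighted_median([1.0, 2.0, 7.0], [1.0, 1.0, 1.0]))  # -> 2.0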
--- sklearn/tree/_criterion.pyx | 60 +++++++++++++++++++++++++++---------- sklearn/tree/tree.py | 2 +- 2 files changed, 46 insertions(+), 16 deletions(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index ecc3e2d924727..8f26a1cb847d7 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -968,6 +968,8 @@ cdef class MAE(RegressionCriterion): """ cdef void node_value(self, double* dest) nogil: """Computes the node value of samples[start:end] into dest.""" + with gil: + print "entered node_value" cdef double* sample_weight = self.sample_weight cdef SIZE_t* samples = self.samples @@ -980,13 +982,16 @@ cdef class MAE(RegressionCriterion): cdef DOUBLE_t w = 1.0 cdef DOUBLE_t y_ik - cdef DOUBLE_t sum_weights = 0 + cdef DOUBLE_t sum_weights = 0.0 cdef SIZE_t median_index = 0 - cdef DOUBLE_t sum - - - cdef DOUBLE_t* y_vals - cdef DOUBLE_t* weights + cdef DOUBLE_t Sum + + y_vals = NULL + weights = NULL + y_vals = calloc(self.n_node_samples, sizeof(double)) + weights = calloc(self.n_node_samples, sizeof(double)) + cdef double* y_val_pointer = y_vals + cdef double* weight_pointer = weights for k in range(self.n_outputs): for p in range(start,end): i = samples[p] @@ -995,24 +1000,49 @@ cdef class MAE(RegressionCriterion): w = sample_weight[i] y_ik = y[i * self.y_stride + k] - y_vals[p] = y_ik - weights[p] = w + + y_val_pointer[p] = y_ik + weight_pointer[p] = w - # calculate weighted median for p in range(start, end): - sum_weights += weights[p] - sum = sum_weights - weights[0] + sum_weights += weight_pointer[p] + + Sum = sum_weights - weight_pointer[0] - while(sum > (sum_weights/2)): + while(Sum > sum_weights/2): median_index +=1 - sum -= weights[median_index] - dest[k] = samples[median_index] + Sum -= weight_pointer[median_index] + + with gil: + print "calculated weighted median:" + print y_val_pointer[median_index] + dest[k] = y_val_pointer[median_index] + with gil: + print "normally this isn't printed because of bus error: 10" cdef double node_impurity(self) nogil: """Evaluate the impurity of the current node, i.e. the impurity of samples[start:end]""" + with gil: + print "Entered node_impurity function" + cdef double* medians + cdef double impurity = 0.0 + cdef SIZE_t* samples = self.samples + cdef SIZE_t k + cdef SIZE_t p + cdef SIZE_t i + cdef DOUBLE_t y_ik + self.node_value(medians) + with gil: + print "exited node_value" + for k in range(self.n_outputs): + for p in range(self.start, self.end): + i = samples[p] + y_ik = self.y[i * self.y_stride + k] + + impurity += fabs(y_ik - medians[k]) / self.n_node_samples + return impurity / self.n_outputs # todo - pass cdef double proxy_impurity_improvement(self) nogil: """Compute a proxy of the impurity reduction diff --git a/sklearn/tree/tree.py b/sklearn/tree/tree.py index 658b9c46b298d..4f8ebf9e960ed 100644 --- a/sklearn/tree/tree.py +++ b/sklearn/tree/tree.py @@ -57,7 +57,7 @@ CRITERIA_CLF = {"gini": _criterion.Gini, "entropy": _criterion.Entropy} CRITERIA_REG = {"mse": _criterion.MSE, "friedman_mse": _criterion.FriedmanMSE, - "mae": _criterion_MAE} + "mae": _criterion.MAE} DENSE_SPLITTERS = {"best": _splitter.BestSplitter, "random": _splitter.RandomSplitter} From c7b640aac7f31c98b28f1945da7a0410ce347338 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Sun, 17 Apr 2016 20:09:27 -0700 Subject: [PATCH 03/75] fix: node_value now correctly calculating weighted median for sorted data. Still need to change the code to work with unsorted data. 
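The loop in the new node_value finds that median by walking a running tail sum of the weights: it starts from sum = sum_weights - weights[0], the weight lying strictly above index 0, and advances median_index while more than half of the total weight still sits above the current position. A worked trace of that loop in plain Python, assuming already-sorted input as noted above (made-up numbers; s stands in for the diff's sum):

    y_vals  = [1.0, 2.0, 5.0, 9.0]
    weights = [1.0, 1.0, 3.0, 1.0]

    sum_weights = sum(weights)        # 6.0
    s = sum_weights - weights[0]      # 5.0 of weight sits above index 0
    median_index = 0
    while s > sum_weights / 2:        # stop once at most half the weight remains above
        median_index += 1
        s -= weights[median_index]

    print(median_index, y_vals[median_index])  # -> 2 5.0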
--- sklearn/tree/_criterion.pyx | 54 +++++++++++++------------------------ 1 file changed, 19 insertions(+), 35 deletions(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 8f26a1cb847d7..79d65487536ca 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -968,8 +968,6 @@ cdef class MAE(RegressionCriterion): """ cdef void node_value(self, double* dest) nogil: """Computes the node value of samples[start:end] into dest.""" - with gil: - print "entered node_value" cdef double* sample_weight = self.sample_weight cdef SIZE_t* samples = self.samples @@ -982,50 +980,40 @@ cdef class MAE(RegressionCriterion): cdef DOUBLE_t w = 1.0 cdef DOUBLE_t y_ik - cdef DOUBLE_t sum_weights = 0.0 - cdef SIZE_t median_index = 0 - cdef DOUBLE_t Sum + cdef DOUBLE_t sum_weights + cdef SIZE_t median_index + cdef DOUBLE_t sum - y_vals = NULL - weights = NULL - y_vals = calloc(self.n_node_samples, sizeof(double)) - weights = calloc(self.n_node_samples, sizeof(double)) - cdef double* y_val_pointer = y_vals - cdef double* weight_pointer = weights + cdef double* y_vals = calloc(self.n_node_samples, sizeof(double)) + cdef double* weights = calloc(self.n_node_samples, sizeof(double)) for k in range(self.n_outputs): + median_index = 0 + sum_weights = 0.0 for p in range(start,end): i = samples[p] - + + y_ik = y[i * self.y_stride + k] if sample_weight != NULL: w = sample_weight[i] - y_ik = y[i * self.y_stride + k] - - y_val_pointer[p] = y_ik - weight_pointer[p] = w + y_vals[p] = y_ik + weights[p] = w for p in range(start, end): - sum_weights += weight_pointer[p] + sum_weights += weights[p] - Sum = sum_weights - weight_pointer[0] - - while(Sum > sum_weights/2): + sum = sum_weights - weights[0] + + while(sum > sum_weights/2): median_index +=1 - Sum -= weight_pointer[median_index] + sum -= weights[median_index] - with gil: - print "calculated weighted median:" - print y_val_pointer[median_index] - dest[k] = y_val_pointer[median_index] - with gil: - print "normally this isn't printed because of bus error: 10" + dest[k] = y_vals[median_index] cdef double node_impurity(self) nogil: """Evaluate the impurity of the current node, i.e. 
the impurity of samples[start:end]""" - with gil: - print "Entered node_impurity function" - cdef double* medians + cdef double* medians = calloc(self.n_outputs, sizeof(double)) cdef double impurity = 0.0 cdef SIZE_t* samples = self.samples cdef SIZE_t k @@ -1033,16 +1021,12 @@ cdef class MAE(RegressionCriterion): cdef SIZE_t i cdef DOUBLE_t y_ik self.node_value(medians) - with gil: - print "exited node_value" for k in range(self.n_outputs): for p in range(self.start, self.end): i = samples[p] y_ik = self.y[i * self.y_stride + k] - - impurity += fabs(y_ik - medians[k]) / self.n_node_samples + impurity += fabs(y_ik - medians[k]) / self.weighted_n_node_samples return impurity / self.n_outputs - # todo cdef double proxy_impurity_improvement(self) nogil: """Compute a proxy of the impurity reduction From 2fb76516f0d69c659aed3d5c5cc07692013611d1 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Mon, 18 Apr 2016 20:23:55 -0700 Subject: [PATCH 04/75] fix: node_value now correctly calculates median regardless of initial order --- sklearn/tree/_criterion.pyx | 43 +++++++++++++++++++++++++++++++++++-- 1 file changed, 41 insertions(+), 2 deletions(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 79d65487536ca..36847ed448ceb 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -986,6 +986,7 @@ cdef class MAE(RegressionCriterion): cdef double* y_vals = calloc(self.n_node_samples, sizeof(double)) cdef double* weights = calloc(self.n_node_samples, sizeof(double)) + cdef SIZE_t* sorted_indexes for k in range(self.n_outputs): median_index = 0 sum_weights = 0.0 @@ -1001,15 +1002,53 @@ cdef class MAE(RegressionCriterion): for p in range(start, end): sum_weights += weights[p] - + + self.sort_values_and_weights(y_vals, weights, 0, self.n_node_samples - 1) + sum = sum_weights - weights[0] while(sum > sum_weights/2): median_index +=1 sum -= weights[median_index] - dest[k] = y_vals[median_index] + if start-end % 2 == 0: + dest[k] = (y_vals[median_index] + y_vals[median_index + 1]) / 2 + else: + dest[k] = y_vals[median_index] + + cdef void sort_values_and_weights(self, double* y_vals, double* weights, + SIZE_t low, SIZE_t high) nogil: + """Sort an array and its weights""" + cdef SIZE_t pivot, i, j, + cdef double temp + if low < high: + pivot = low + i = low + j = high + while i < j: + while(y_vals[i] <= y_vals[pivot] and i <= high): + i += 1 + while(y_vals[j] > y_vals[pivot] and j >= low): + j -= 1 + if i < j: + temp = y_vals[i] + y_vals[i] = y_vals[j] + y_vals[j] = temp + + temp = weights[i] + weights[i] = weights[j] + weights[j] = temp + temp = y_vals[j] + y_vals[j] = y_vals[pivot] + y_vals[pivot] = temp + + temp = weights[j] + weights[j] = weights[pivot] + weights[pivot] = temp + self.sort_values_and_weights(y_vals, weights, low, j-1) + self.sort_values_and_weights(y_vals, weights, j+1, high) + cdef double node_impurity(self) nogil: """Evaluate the impurity of the current node, i.e. 
the impurity of samples[start:end]""" From a3f2f7651a55e096473c5b18ea88fc26853ee84f Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Mon, 18 Apr 2016 20:50:33 -0700 Subject: [PATCH 05/75] fix: correct bug in calculating median when taking midpoint is necessary --- sklearn/tree/_criterion.pyx | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 36847ed448ceb..6a7e1767b5017 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -999,11 +999,20 @@ cdef class MAE(RegressionCriterion): y_vals[p] = y_ik weights[p] = w + with gil: + print "p {}".format(p) + print "unsorted y val {}".format(y_vals[p]) + print "unsorted weight {}".format(weights[p]) for p in range(start, end): sum_weights += weights[p] self.sort_values_and_weights(y_vals, weights, 0, self.n_node_samples - 1) + for p in range(start, end): + with gil: + print "p {}".format(p) + print "sorted y val {}".format(y_vals[p]) + print "sorted weight {}".format(weights[p]) sum = sum_weights - weights[0] @@ -1011,10 +1020,12 @@ cdef class MAE(RegressionCriterion): median_index +=1 sum -= weights[median_index] - if start-end % 2 == 0: + if sum == sum_weights/2: dest[k] = (y_vals[median_index] + y_vals[median_index + 1]) / 2 else: dest[k] = y_vals[median_index] + with gil: + print dest[k] cdef void sort_values_and_weights(self, double* y_vals, double* weights, SIZE_t low, SIZE_t high) nogil: From c40a54b752be52024ebc0c4ff8b0080be68b42b5 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Tue, 19 Apr 2016 15:36:55 -0700 Subject: [PATCH 06/75] feature: add initial version of children_impurity --- sklearn/tree/_criterion.pyx | 77 ++++++++++++++++++++++++++++++------- 1 file changed, 64 insertions(+), 13 deletions(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 6a7e1767b5017..c8d32aaf2fc96 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -999,20 +999,11 @@ cdef class MAE(RegressionCriterion): y_vals[p] = y_ik weights[p] = w - with gil: - print "p {}".format(p) - print "unsorted y val {}".format(y_vals[p]) - print "unsorted weight {}".format(weights[p]) for p in range(start, end): sum_weights += weights[p] self.sort_values_and_weights(y_vals, weights, 0, self.n_node_samples - 1) - for p in range(start, end): - with gil: - print "p {}".format(p) - print "sorted y val {}".format(y_vals[p]) - print "sorted weight {}".format(weights[p]) sum = sum_weights - weights[0] @@ -1024,8 +1015,6 @@ cdef class MAE(RegressionCriterion): dest[k] = (y_vals[median_index] + y_vals[median_index + 1]) / 2 else: dest[k] = y_vals[median_index] - with gil: - print dest[k] cdef void sort_values_and_weights(self, double* y_vals, double* weights, SIZE_t low, SIZE_t high) nogil: @@ -1087,6 +1076,9 @@ cdef class MAE(RegressionCriterion): The absolute impurity improvement is only computed by the impurity_improvement method once the best split has been found. """ + cdef SIZE_t k + cdef double proxy_impurity_left = 0.0 + cdef double proxy_impurity_right = 0.0 # todo pass @@ -1097,8 +1089,67 @@ cdef class MAE(RegressionCriterion): left child (samples[start:pos]) and the impurity the right child (samples[pos:end]). 
""" - # todo - pass + cdef DOUBLE_t* y = self.y + cdef DOUBLE_t* sample_weight = self.sample_weight + cdef SIZE_t* samples = self.samples + + cdef SIZE_t pos = self.pos + cdef SIZE_t start = self.start + + cdef double impurity_total = self.node_impurity() + + cdef DOUBLE_t sum_weights + cdef SIZE_t median_index + cdef DOUBLE_t sum + + cdef SIZE_t i + cdef SIZE_t p + cdef SIZE_t k + cdef DOUBLE_t w = 1.0 + cdef DOUBLE_t y_ik + + cdef double* y_vals = calloc(self.n_node_samples, sizeof(double)) + cdef double* weights = calloc(self.n_node_samples, sizeof(double)) + cdef double* medians = calloc(self.n_outputs, sizeof(double)) + + for k in range(self.n_outputs): + median_index = 0 + sum_weights = 0.0 + for p in range(start, pos): + i = samples[p] + y_ik = y[i * self.y_stride + k] + + if sample_weight != NULL: + w = sample_weight[i] + + y_vals[p] = y_ik + weights[p] = w + + for p in range(start, pos): + sum_weights += weights[p] + + self.sort_values_and_weights(y_vals, weights, 0, self.n_node_samples - 1) + sum = sum_weights - weights[0] + + while(sum > sum_weights/2): + median_index +=1 + sum -= weights[median_index] + + if sum == sum_weights/2: + medians[k] = (y_vals[median_index] + y_vals[median_index + 1]) / 2 + else: + medians[k] = y_vals[median_index] + + for k in range(self.n_outputs): + for p in range(start, pos): + i = samples[p] + y_ik = y[i * self.y_stride + k] + impurity_left[0] += fabs(y_ik - medians[k]) / (pos - start) + + impurity_right[0] = impurity_total - impurity_left[0] + + impurity_left[0] /= self.n_outputs + impurity_right[0] /= self.n_outputs cdef class FriedmanMSE(MSE): """Mean squared error impurity criterion with improvement score by Friedman From 19e811dcab439a0254512d516ecbad2581fe0b7d Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Tue, 19 Apr 2016 16:33:03 -0700 Subject: [PATCH 07/75] feature: refactor median calculation into one function --- sklearn/tree/_criterion.pyx | 75 +++++++++++-------------------------- 1 file changed, 22 insertions(+), 53 deletions(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index c8d32aaf2fc96..f95c41ccf71a7 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -967,26 +967,30 @@ cdef class MAE(RegressionCriterion): """Mean absolute error impurity criterion """ cdef void node_value(self, double* dest) nogil: - """Computes the node value of samples[start:end] into dest.""" - cdef double* sample_weight = self.sample_weight - cdef SIZE_t* samples = self.samples - - cdef DOUBLE_t* y = self.y + """Computes the node value of samples[start:end] into dest.""" cdef SIZE_t start = self.start cdef SIZE_t end = self.end - cdef SIZE_t i - cdef SIZE_t p - cdef SIZE_t k + + cdef double* y_vals = calloc(self.n_node_samples, sizeof(double)) + cdef double* weights = calloc(self.n_node_samples, sizeof(double)) + self.compute_weighted_median(dest, y_vals, weights, start, end) + + cdef void compute_weighted_median(self, double* median_dest, double* y_vals, + double* weights, SIZE_t start, SIZE_t end) nogil: + """Calculate the weighted median and put it into a destination pointer + given values, weights, and a start and end index + """ + cdef double* sample_weight = self.sample_weight + cdef DOUBLE_t* y = self.y + cdef SIZE_t* samples = self.samples cdef DOUBLE_t w = 1.0 cdef DOUBLE_t y_ik + cdef SIZE_t i, p, k cdef DOUBLE_t sum_weights cdef SIZE_t median_index cdef DOUBLE_t sum - cdef double* y_vals = calloc(self.n_node_samples, sizeof(double)) - cdef double* weights = calloc(self.n_node_samples, 
sizeof(double)) - cdef SIZE_t* sorted_indexes for k in range(self.n_outputs): median_index = 0 sum_weights = 0.0 @@ -999,7 +1003,7 @@ cdef class MAE(RegressionCriterion): y_vals[p] = y_ik weights[p] = w - + for p in range(start, end): sum_weights += weights[p] @@ -1012,9 +1016,10 @@ cdef class MAE(RegressionCriterion): sum -= weights[median_index] if sum == sum_weights/2: - dest[k] = (y_vals[median_index] + y_vals[median_index + 1]) / 2 + median_dest[k] = (y_vals[median_index] + y_vals[median_index + 1]) / 2 else: - dest[k] = y_vals[median_index] + median_dest[k] = y_vals[median_index] + cdef void sort_values_and_weights(self, double* y_vals, double* weights, SIZE_t low, SIZE_t high) nogil: @@ -1076,9 +1081,6 @@ cdef class MAE(RegressionCriterion): The absolute impurity improvement is only computed by the impurity_improvement method once the best split has been found. """ - cdef SIZE_t k - cdef double proxy_impurity_left = 0.0 - cdef double proxy_impurity_right = 0.0 # todo pass @@ -1098,48 +1100,15 @@ cdef class MAE(RegressionCriterion): cdef double impurity_total = self.node_impurity() - cdef DOUBLE_t sum_weights - cdef SIZE_t median_index - cdef DOUBLE_t sum - - cdef SIZE_t i - cdef SIZE_t p - cdef SIZE_t k - cdef DOUBLE_t w = 1.0 + cdef SIZE_t i, p, k cdef DOUBLE_t y_ik cdef double* y_vals = calloc(self.n_node_samples, sizeof(double)) cdef double* weights = calloc(self.n_node_samples, sizeof(double)) cdef double* medians = calloc(self.n_outputs, sizeof(double)) + self.compute_weighted_median(medians, y_vals, weights, start, pos) + - for k in range(self.n_outputs): - median_index = 0 - sum_weights = 0.0 - for p in range(start, pos): - i = samples[p] - y_ik = y[i * self.y_stride + k] - - if sample_weight != NULL: - w = sample_weight[i] - - y_vals[p] = y_ik - weights[p] = w - - for p in range(start, pos): - sum_weights += weights[p] - - self.sort_values_and_weights(y_vals, weights, 0, self.n_node_samples - 1) - sum = sum_weights - weights[0] - - while(sum > sum_weights/2): - median_index +=1 - sum -= weights[median_index] - - if sum == sum_weights/2: - medians[k] = (y_vals[median_index] + y_vals[median_index + 1]) / 2 - else: - medians[k] = y_vals[median_index] - for k in range(self.n_outputs): for p in range(start, pos): i = samples[p] From 31f04b40f2418a87146e64da9503b07735398817 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Thu, 21 Apr 2016 15:08:27 -0700 Subject: [PATCH 08/75] fix: fix use of DOUBLE_t vs double --- sklearn/tree/_criterion.pyx | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index f95c41ccf71a7..e038c38173b79 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -971,12 +971,12 @@ cdef class MAE(RegressionCriterion): cdef SIZE_t start = self.start cdef SIZE_t end = self.end - cdef double* y_vals = calloc(self.n_node_samples, sizeof(double)) - cdef double* weights = calloc(self.n_node_samples, sizeof(double)) + cdef double* y_vals = calloc(self.n_node_samples, sizeof(DOUBLE_t)) + cdef double* weights = calloc(self.n_node_samples, sizeof(DOUBLE_t)) self.compute_weighted_median(dest, y_vals, weights, start, end) - cdef void compute_weighted_median(self, double* median_dest, double* y_vals, - double* weights, SIZE_t start, SIZE_t end) nogil: + cdef void compute_weighted_median(self, double* median_dest, DOUBLE_t* y_vals, + DOUBLE_t* weights, SIZE_t start, SIZE_t end) nogil: """Calculate the weighted median and put it into a destination pointer given values, 
weights, and a start and end index """ @@ -1021,7 +1021,7 @@ cdef class MAE(RegressionCriterion): median_dest[k] = y_vals[median_index] - cdef void sort_values_and_weights(self, double* y_vals, double* weights, + cdef void sort_values_and_weights(self, DOUBLE_t* y_vals, DOUBLE_t* weights, SIZE_t low, SIZE_t high) nogil: """Sort an array and its weights""" cdef SIZE_t pivot, i, j, @@ -1052,7 +1052,6 @@ cdef class MAE(RegressionCriterion): weights[pivot] = temp self.sort_values_and_weights(y_vals, weights, low, j-1) self.sort_values_and_weights(y_vals, weights, j+1, high) - cdef double node_impurity(self) nogil: """Evaluate the impurity of the current node, i.e. the impurity of @@ -1103,8 +1102,8 @@ cdef class MAE(RegressionCriterion): cdef SIZE_t i, p, k cdef DOUBLE_t y_ik - cdef double* y_vals = calloc(self.n_node_samples, sizeof(double)) - cdef double* weights = calloc(self.n_node_samples, sizeof(double)) + cdef DOUBLE_t* y_vals = calloc(self.n_node_samples, sizeof(DOUBLE_t)) + cdef DOUBLE_t* weights = calloc(self.n_node_samples, sizeof(DOUBLE_t)) cdef double* medians = calloc(self.n_outputs, sizeof(double)) self.compute_weighted_median(medians, y_vals, weights, start, pos) From ffff6166468a1a43bf228e7b2276edb734b067e9 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Wed, 11 May 2016 23:49:31 -0700 Subject: [PATCH 09/75] feature: move helper functions to _utils.pyx, fix mismatched pointer type --- sklearn/tree/_criterion.pyx | 119 ++++++++---------------------------- sklearn/tree/_utils.pxd | 7 +++ sklearn/tree/_utils.pyx | 84 +++++++++++++++++++++++++ 3 files changed, 115 insertions(+), 95 deletions(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index e038c38173b79..20d1eb3d7c6ef 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -28,6 +28,7 @@ np.import_array() from ._utils cimport log from ._utils cimport safe_realloc from ._utils cimport sizet_ptr_to_ndarray +from ._utils cimport compute_weighted_median cdef class Criterion: """Interface for impurity criteria. 
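Both helpers now live in _utils.pyx as free nogil functions. The moved sort_values_and_weights is a recursive quicksort that performs every swap on both arrays in lockstep, so each weight stays attached to its value. A compact Python equivalent of that invariant, illustrative only:

    def sort_values_and_weights(y_vals, weights):
        # sort by value and carry the parallel weights array along,
        # as the Cython helper does with pairwise swaps
        order = sorted(range(len(y_vals)), key=y_vals.__getitem__)
        return [y_vals[i] for i in order], [weights[i] for i in order]

    vals, wts = sort_values_and_weights([5.0, 1.0, 3.0], [0.2, 0.5, 0.3])
    print(vals, wts)  # -> [1.0, 3.0, 5.0] [0.5, 0.3, 0.2]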
@@ -848,7 +849,7 @@ cdef class RegressionCriterion(Criterion): self.weighted_n_left -= w - self.weighted_n_right = (self.weighted_n_node_samples - + self.weighted_n_right = (self.weighted_n_node_samples - self.weighted_n_left) for k in range(self.n_outputs): sum_right[k] = sum_total[k] - sum_left[k] @@ -922,7 +923,6 @@ cdef class MSE(RegressionCriterion): left child (samples[start:pos]) and the impurity the right child (samples[pos:end]).""" - cdef DOUBLE_t* y = self.y cdef DOUBLE_t* sample_weight = self.sample_weight cdef SIZE_t* samples = self.samples @@ -958,7 +958,7 @@ cdef class MSE(RegressionCriterion): for k in range(self.n_outputs): impurity_left[0] -= (sum_left[k] / self.weighted_n_left) ** 2.0 - impurity_right[0] -= (sum_right[k] / self.weighted_n_right) ** 2.0 + impurity_right[0] -= (sum_right[k] / self.weighted_n_right) ** 2.0 impurity_left[0] /= self.n_outputs impurity_right[0] /= self.n_outputs @@ -967,101 +967,26 @@ cdef class MAE(RegressionCriterion): """Mean absolute error impurity criterion """ cdef void node_value(self, double* dest) nogil: - """Computes the node value of samples[start:end] into dest.""" + """Computes the node value of samples[start:end] into dest.""" cdef SIZE_t start = self.start cdef SIZE_t end = self.end - cdef double* y_vals = calloc(self.n_node_samples, sizeof(DOUBLE_t)) - cdef double* weights = calloc(self.n_node_samples, sizeof(DOUBLE_t)) - self.compute_weighted_median(dest, y_vals, weights, start, end) - - cdef void compute_weighted_median(self, double* median_dest, DOUBLE_t* y_vals, - DOUBLE_t* weights, SIZE_t start, SIZE_t end) nogil: - """Calculate the weighted median and put it into a destination pointer - given values, weights, and a start and end index - """ - cdef double* sample_weight = self.sample_weight - cdef DOUBLE_t* y = self.y - cdef SIZE_t* samples = self.samples - cdef DOUBLE_t w = 1.0 - cdef DOUBLE_t y_ik - - cdef SIZE_t i, p, k - cdef DOUBLE_t sum_weights - cdef SIZE_t median_index - cdef DOUBLE_t sum - - for k in range(self.n_outputs): - median_index = 0 - sum_weights = 0.0 - for p in range(start,end): - i = samples[p] - - y_ik = y[i * self.y_stride + k] - if sample_weight != NULL: - w = sample_weight[i] + cdef DOUBLE_t* y_vals = calloc(self.n_node_samples, + sizeof(DOUBLE_t)) + cdef DOUBLE_t* weights = calloc(self.n_node_samples, + sizeof(DOUBLE_t)) + compute_weighted_median(dest, y_vals, weights, start, end, + self.sample_weight, self.y, self.samples, + self.y_stride, self.n_node_samples, + self.n_outputs) - y_vals[p] = y_ik - weights[p] = w - - for p in range(start, end): - sum_weights += weights[p] - - self.sort_values_and_weights(y_vals, weights, 0, self.n_node_samples - 1) - - sum = sum_weights - weights[0] - - while(sum > sum_weights/2): - median_index +=1 - sum -= weights[median_index] - - if sum == sum_weights/2: - median_dest[k] = (y_vals[median_index] + y_vals[median_index + 1]) / 2 - else: - median_dest[k] = y_vals[median_index] - - - cdef void sort_values_and_weights(self, DOUBLE_t* y_vals, DOUBLE_t* weights, - SIZE_t low, SIZE_t high) nogil: - """Sort an array and its weights""" - cdef SIZE_t pivot, i, j, - cdef double temp - if low < high: - pivot = low - i = low - j = high - while i < j: - while(y_vals[i] <= y_vals[pivot] and i <= high): - i += 1 - while(y_vals[j] > y_vals[pivot] and j >= low): - j -= 1 - if i < j: - temp = y_vals[i] - y_vals[i] = y_vals[j] - y_vals[j] = temp - - temp = weights[i] - weights[i] = weights[j] - weights[j] = temp - temp = y_vals[j] - y_vals[j] = y_vals[pivot] - y_vals[pivot] = temp 
- - temp = weights[j] - weights[j] = weights[pivot] - weights[pivot] = temp - self.sort_values_and_weights(y_vals, weights, low, j-1) - self.sort_values_and_weights(y_vals, weights, j+1, high) - cdef double node_impurity(self) nogil: - """Evaluate the impurity of the current node, i.e. the impurity of + """Evaluate the impurity of the current node, i.e. the impurity of samples[start:end]""" cdef double* medians = calloc(self.n_outputs, sizeof(double)) cdef double impurity = 0.0 cdef SIZE_t* samples = self.samples - cdef SIZE_t k - cdef SIZE_t p - cdef SIZE_t i + cdef SIZE_t i, p, k cdef DOUBLE_t y_ik self.node_value(medians) for k in range(self.n_outputs): @@ -1102,12 +1027,16 @@ cdef class MAE(RegressionCriterion): cdef SIZE_t i, p, k cdef DOUBLE_t y_ik - cdef DOUBLE_t* y_vals = calloc(self.n_node_samples, sizeof(DOUBLE_t)) - cdef DOUBLE_t* weights = calloc(self.n_node_samples, sizeof(DOUBLE_t)) + cdef DOUBLE_t* y_vals = calloc(self.n_node_samples, + sizeof(DOUBLE_t)) + cdef DOUBLE_t* weights = calloc(self.n_node_samples, + sizeof(DOUBLE_t)) cdef double* medians = calloc(self.n_outputs, sizeof(double)) - self.compute_weighted_median(medians, y_vals, weights, start, pos) - - + compute_weighted_median(medians, y_vals, weights, start, pos, + self.sample_weight, self.y, self.samples, + self.y_stride, self.n_node_samples, + self.n_outputs) + for k in range(self.n_outputs): for p in range(start, pos): i = samples[p] @@ -1175,5 +1104,5 @@ cdef class FriedmanMSE(MSE): diff = (self.weighted_n_right * total_sum_left - self.weighted_n_left * total_sum_right) / self.n_outputs - return (diff * diff / (self.weighted_n_left * self.weighted_n_right * + return (diff * diff / (self.weighted_n_left * self.weighted_n_right * self.weighted_n_node_samples)) diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd index 9537bbb91cf27..69e023ce83961 100644 --- a/sklearn/tree/_utils.pxd +++ b/sklearn/tree/_utils.pxd @@ -39,6 +39,13 @@ cdef realloc_ptr safe_realloc(realloc_ptr* p, size_t nelems) except * cdef np.ndarray sizet_ptr_to_ndarray(SIZE_t* data, SIZE_t size) +cdef void compute_weighted_median(double* median_dest, DOUBLE_t* y_vals, + DOUBLE_t* weights, SIZE_t start, SIZE_t end, + DOUBLE_t* sample_weight, DOUBLE_t* y, + SIZE_t* samples, SIZE_t y_stride, + SIZE_t n_node_samples, + SIZE_t n_outputs) nogil + cdef SIZE_t rand_int(SIZE_t low, SIZE_t high, UINT32_t* random_state) nogil diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index 6a3833128b5fa..5f508521c2069 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -49,6 +49,90 @@ def _realloc_test(): assert False +cdef void compute_weighted_median(double* median_dest, DOUBLE_t* y_vals, + DOUBLE_t* weights, SIZE_t start, SIZE_t end, + DOUBLE_t* sample_weight, DOUBLE_t* y, + SIZE_t* samples, SIZE_t y_stride, + SIZE_t n_node_samples, + SIZE_t n_outputs) nogil: + """Calculate the weighted median and put it into a destination pointer + given values, weights, and a start and end index + """ + # cdef DOUBLE_t* sample_weight = self.sample_weight + # cdef DOUBLE_t* y = self.y + # cdef SIZE_t* samples = self.samples + cdef DOUBLE_t w = 1.0 + cdef DOUBLE_t y_ik + + cdef SIZE_t i, p, k + cdef DOUBLE_t sum_weights + cdef SIZE_t median_index + cdef DOUBLE_t sum + + for k in range(n_outputs): + median_index = 0 + sum_weights = 0.0 + for p in range(start,end): + i = samples[p] + + y_ik = y[i * y_stride + k] + if sample_weight != NULL: + w = sample_weight[i] + + y_vals[p] = y_ik + weights[p] = w + + for p in range(start, end): + 
sum_weights += weights[p] + + # self.sort_values_and_weights(y_vals, weights, 0, self.n_node_samples - 1) + sort_values_and_weights(y_vals, weights, 0, n_node_samples - 1) + + sum = sum_weights - weights[0] + + while(sum > sum_weights/2): + median_index +=1 + sum -= weights[median_index] + + if sum == sum_weights/2: + median_dest[k] = (y_vals[median_index] + y_vals[median_index + 1]) / 2 + else: + median_dest[k] = y_vals[median_index] + + +cdef void sort_values_and_weights(DOUBLE_t* y_vals, DOUBLE_t* weights, + SIZE_t low, SIZE_t high) nogil: + """Sort an array and its weights""" + cdef SIZE_t pivot, i, j, + cdef double temp + if low < high: + pivot = low + i = low + j = high + while i < j: + while(y_vals[i] <= y_vals[pivot] and i <= high): + i += 1 + while(y_vals[j] > y_vals[pivot] and j >= low): + j -= 1 + if i < j: + temp = y_vals[i] + y_vals[i] = y_vals[j] + y_vals[j] = temp + + temp = weights[i] + weights[i] = weights[j] + weights[j] = temp + temp = y_vals[j] + y_vals[j] = y_vals[pivot] + y_vals[pivot] = temp + + temp = weights[j] + weights[j] = weights[pivot] + weights[pivot] = temp + sort_values_and_weights(y_vals, weights, low, j-1) + sort_values_and_weights(y_vals, weights, j+1, high) + + # rand_r replacement using a 32bit XorShift generator # See http://www.jstatsoft.org/v08/i14/paper for details cdef inline UINT32_t our_rand_r(UINT32_t* seed) nogil: From bfde38d4fdcceb5c61d49dc915233b4feef21236 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Sat, 14 May 2016 16:58:38 -0700 Subject: [PATCH 10/75] fix: fix some bugs in children_impurity method --- sklearn/tree/_criterion.pxd | 2 +- sklearn/tree/_criterion.pyx | 21 +++++++++++++-------- sklearn/tree/_utils.pxd | 1 - sklearn/tree/_utils.pyx | 9 ++++----- 4 files changed, 18 insertions(+), 15 deletions(-) diff --git a/sklearn/tree/_criterion.pxd b/sklearn/tree/_criterion.pxd index 889a623d732b3..172d57659e6a6 100644 --- a/sklearn/tree/_criterion.pxd +++ b/sklearn/tree/_criterion.pxd @@ -44,7 +44,7 @@ cdef class Criterion: # weighted count of each label. For regression, # the sum of w*y. sum_total[k] is equal to # sum_{i=start}^{end-1} w[samples[i]]*y[samples[i], k], - # where k is output index. + # where k is output index. cdef double* sum_left # Same as above, but for the left side of the split cdef double* sum_right # same as above, but for the right side of the split diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 20d1eb3d7c6ef..3c5d61dc3ffed 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -977,8 +977,7 @@ cdef class MAE(RegressionCriterion): sizeof(DOUBLE_t)) compute_weighted_median(dest, y_vals, weights, start, end, self.sample_weight, self.y, self.samples, - self.y_stride, self.n_node_samples, - self.n_outputs) + self.y_stride, self.n_outputs) cdef double node_impurity(self) nogil: """Evaluate the impurity of the current node, i.e. 
the impurity of @@ -994,6 +993,8 @@ cdef class MAE(RegressionCriterion): i = samples[p] y_ik = self.y[i * self.y_stride + k] impurity += fabs(y_ik - medians[k]) / self.weighted_n_node_samples + with gil: + print "impurity / self.n_outputs = {} / {} = {}".format(impurity, self.n_outputs, impurity / self.n_outputs) return impurity / self.n_outputs cdef double proxy_impurity_improvement(self) nogil: @@ -1019,8 +1020,9 @@ cdef class MAE(RegressionCriterion): cdef DOUBLE_t* sample_weight = self.sample_weight cdef SIZE_t* samples = self.samples - cdef SIZE_t pos = self.pos cdef SIZE_t start = self.start + cdef SIZE_t pos = self.pos + cdef SIZE_t end = self.end cdef double impurity_total = self.node_impurity() @@ -1031,11 +1033,10 @@ cdef class MAE(RegressionCriterion): sizeof(DOUBLE_t)) cdef DOUBLE_t* weights = calloc(self.n_node_samples, sizeof(DOUBLE_t)) - cdef double* medians = calloc(self.n_outputs, sizeof(double)) + cdef double* medians = calloc(self.n_outputs, sizeof(double)) compute_weighted_median(medians, y_vals, weights, start, pos, self.sample_weight, self.y, self.samples, - self.y_stride, self.n_node_samples, - self.n_outputs) + self.y_stride, self.n_outputs) for k in range(self.n_outputs): for p in range(start, pos): @@ -1044,9 +1045,13 @@ cdef class MAE(RegressionCriterion): impurity_left[0] += fabs(y_ik - medians[k]) / (pos - start) impurity_right[0] = impurity_total - impurity_left[0] + with gil: + print "start: {}".format(start) + print "pos: {}".format(pos) + print "end: {}".format(end) + print "impurity_left[0]: {}".format(impurity_left[0]) + print "impurity_right[0]: {}".format(impurity_right[0]) - impurity_left[0] /= self.n_outputs - impurity_right[0] /= self.n_outputs cdef class FriedmanMSE(MSE): """Mean squared error impurity criterion with improvement score by Friedman diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd index 69e023ce83961..cafecca9d124f 100644 --- a/sklearn/tree/_utils.pxd +++ b/sklearn/tree/_utils.pxd @@ -43,7 +43,6 @@ cdef void compute_weighted_median(double* median_dest, DOUBLE_t* y_vals, DOUBLE_t* weights, SIZE_t start, SIZE_t end, DOUBLE_t* sample_weight, DOUBLE_t* y, SIZE_t* samples, SIZE_t y_stride, - SIZE_t n_node_samples, SIZE_t n_outputs) nogil diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index 5f508521c2069..72a4e1c828b2e 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -53,16 +53,17 @@ cdef void compute_weighted_median(double* median_dest, DOUBLE_t* y_vals, DOUBLE_t* weights, SIZE_t start, SIZE_t end, DOUBLE_t* sample_weight, DOUBLE_t* y, SIZE_t* samples, SIZE_t y_stride, - SIZE_t n_node_samples, SIZE_t n_outputs) nogil: - """Calculate the weighted median and put it into a destination pointer - given values, weights, and a start and end index + """Calculate the weighted median of samples[start:end] and put + it into a destination pointer + given values, weights, and a start and end index. 
""" # cdef DOUBLE_t* sample_weight = self.sample_weight # cdef DOUBLE_t* y = self.y # cdef SIZE_t* samples = self.samples cdef DOUBLE_t w = 1.0 cdef DOUBLE_t y_ik + cdef SIZE_t n_node_samples = end-start cdef SIZE_t i, p, k cdef DOUBLE_t sum_weights @@ -85,9 +86,7 @@ cdef void compute_weighted_median(double* median_dest, DOUBLE_t* y_vals, for p in range(start, end): sum_weights += weights[p] - # self.sort_values_and_weights(y_vals, weights, 0, self.n_node_samples - 1) sort_values_and_weights(y_vals, weights, 0, n_node_samples - 1) - sum = sum_weights - weights[0] while(sum > sum_weights/2): From 8b77de01bf4bb916cc111fc1e075f528f351a6e6 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Wed, 18 May 2016 19:18:13 -0700 Subject: [PATCH 11/75] push a debug version to try to solve segfault --- sklearn/tree/_criterion.pyx | 84 +++++++++++++++++++++++-------------- sklearn/tree/_splitter.pyx | 6 ++- sklearn/tree/_tree.pyx | 6 ++- sklearn/tree/_utils.pyx | 16 +++---- sklearn/tree/tree.py | 2 +- 5 files changed, 71 insertions(+), 43 deletions(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 3c5d61dc3ffed..0a0186fdd6390 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -39,10 +39,11 @@ cdef class Criterion: def __dealloc__(self): """Destructor.""" - + print "entered criterion dealloc" free(self.sum_total) free(self.sum_left) free(self.sum_right) + print "exited criterion dealloc" def __getstate__(self): return {} @@ -170,6 +171,7 @@ cdef class Criterion: return (- self.weighted_n_right * impurity_right - self.weighted_n_left * impurity_left) + cdef double impurity_improvement(self, double impurity) nogil: """Placeholder for improvement in impurity after a split. @@ -200,9 +202,9 @@ cdef class Criterion: self.children_impurity(&impurity_left, &impurity_right) return ((self.weighted_n_node_samples / self.weighted_n_samples) * - (impurity - (self.weighted_n_right / + (impurity - (self.weighted_n_right / self.weighted_n_node_samples * impurity_right) - - (self.weighted_n_left / + - (self.weighted_n_left / self.weighted_n_node_samples * impurity_left))) @@ -265,14 +267,14 @@ cdef class ClassificationCriterion(Criterion): self.sum_left = calloc(n_elements, sizeof(double)) self.sum_right = calloc(n_elements, sizeof(double)) - if (self.sum_total == NULL or + if (self.sum_total == NULL or self.sum_left == NULL or self.sum_right == NULL): raise MemoryError() def __dealloc__(self): """Destructor.""" - + print "entered classificationcriterion dealloc" free(self.n_classes) def __reduce__(self): @@ -724,7 +726,8 @@ cdef class RegressionCriterion(Criterion): self.sum_left = calloc(n_outputs, sizeof(double)) self.sum_right = calloc(n_outputs, sizeof(double)) - if (self.sum_total == NULL or + + if (self.sum_total == NULL or self.sum_left == NULL or self.sum_right == NULL): raise MemoryError() @@ -970,11 +973,15 @@ cdef class MAE(RegressionCriterion): """Computes the node value of samples[start:end] into dest.""" cdef SIZE_t start = self.start cdef SIZE_t end = self.end - - cdef DOUBLE_t* y_vals = calloc(self.n_node_samples, - sizeof(DOUBLE_t)) - cdef DOUBLE_t* weights = calloc(self.n_node_samples, - sizeof(DOUBLE_t)) + cdef double* y_vals = NULL + cdef double* weights = NULL + y_vals = calloc(self.n_node_samples, + sizeof(double)) + weights = calloc(self.n_node_samples, + sizeof(double)) + if (y_vals == NULL or weights == NULL): + with gil: + raise MemoryError() compute_weighted_median(dest, y_vals, weights, start, end, self.sample_weight, self.y, self.samples, 
self.y_stride, self.n_outputs) @@ -982,7 +989,13 @@ cdef class MAE(RegressionCriterion): cdef double node_impurity(self) nogil: """Evaluate the impurity of the current node, i.e. the impurity of samples[start:end]""" - cdef double* medians = calloc(self.n_outputs, sizeof(double)) + with gil: + print "entered node_impurity" + cdef double* medians = NULL + medians = calloc(self.n_outputs, sizeof(double)) + if (medians == NULL): + with gil: + raise MemoryError() cdef double impurity = 0.0 cdef SIZE_t* samples = self.samples cdef SIZE_t i, p, k @@ -992,23 +1005,23 @@ cdef class MAE(RegressionCriterion): for p in range(self.start, self.end): i = samples[p] y_ik = self.y[i * self.y_stride + k] - impurity += fabs(y_ik - medians[k]) / self.weighted_n_node_samples + impurity += (fabs(y_ik - medians[k]) / self.weighted_n_node_samples) with gil: print "impurity / self.n_outputs = {} / {} = {}".format(impurity, self.n_outputs, impurity / self.n_outputs) return impurity / self.n_outputs - cdef double proxy_impurity_improvement(self) nogil: - """Compute a proxy of the impurity reduction - This method is used to speed up the search for the best split. - It is a proxy quantity such that the split that maximizes this value - also maximizes the impurity improvement. It neglects all constant terms - of the impurity decrease for a given split. - The absolute impurity improvement is only computed by the - impurity_improvement method once the best split has been found. - """ + # cdef double proxy_impurity_improvement(self) nogil: + # """Compute a proxy of the impurity reduction + # This method is used to speed up the search for the best split. + # It is a proxy quantity such that the split that maximizes this value + # also maximizes the impurity improvement. It neglects all constant terms + # of the impurity decrease for a given split. + # The absolute impurity improvement is only computed by the + # impurity_improvement method once the best split has been found. 
+ # """ - # todo - pass + # # todo + # pass cdef void children_impurity(self, double* impurity_left, double* impurity_right) nogil: @@ -1029,11 +1042,18 @@ cdef class MAE(RegressionCriterion): cdef SIZE_t i, p, k cdef DOUBLE_t y_ik - cdef DOUBLE_t* y_vals = calloc(self.n_node_samples, - sizeof(DOUBLE_t)) - cdef DOUBLE_t* weights = calloc(self.n_node_samples, - sizeof(DOUBLE_t)) - cdef double* medians = calloc(self.n_outputs, sizeof(double)) + cdef double* y_vals = NULL + cdef double* weights = NULL + cdef double* medians = NULL + + y_vals = calloc(self.n_node_samples, + sizeof(double)) + weights = calloc(self.n_node_samples, + sizeof(double)) + medians = calloc(self.n_outputs, sizeof(double)) + if (y_vals == NULL or weights == NULL or medians == NULL): + with gil: + raise MemoryError() compute_weighted_median(medians, y_vals, weights, start, pos, self.sample_weight, self.y, self.samples, self.y_stride, self.n_outputs) @@ -1046,9 +1066,9 @@ cdef class MAE(RegressionCriterion): impurity_right[0] = impurity_total - impurity_left[0] with gil: - print "start: {}".format(start) - print "pos: {}".format(pos) - print "end: {}".format(end) + # print "start: {}".format(start) + # print "pos: {}".format(pos) + # print "end: {}".format(end) print "impurity_left[0]: {}".format(impurity_left[0]) print "impurity_right[0]: {}".format(impurity_right[0]) diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 0617508aab236..c2ea129a86f25 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -104,11 +104,12 @@ cdef class Splitter: def __dealloc__(self): """Destructor.""" - + print "entered splitter dealloc" free(self.samples) free(self.features) free(self.constant_features) free(self.feature_values) + print "exited splitter dealloc" def __getstate__(self): return {} @@ -253,9 +254,11 @@ cdef class BaseDenseSplitter(Splitter): self.presort = presort def __dealloc__(self): + print "entered basedensesplitter dealloc" """Destructor.""" if self.presort == 1: free(self.sample_mask) + print "exited basedensesplitter dealloc" cdef void init(self, object X, @@ -861,6 +864,7 @@ cdef class BaseSparseSplitter(Splitter): self.sorted_samples = NULL def __dealloc__(self): + print "entered basesparsesplitter dealloc" """Deallocate memory.""" free(self.index_to_samples) free(self.sorted_samples) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index f44320a7b47ae..d0f384589a626 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -264,7 +264,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): tree.max_depth = max_depth_seen if rc == -1: raise MemoryError() - + print "dont building" # Best first builder ---------------------------------------------------------- @@ -603,6 +603,7 @@ cdef class Tree: def __dealloc__(self): """Destructor.""" + print "entered tree dealloc" # Free all inner structures free(self.n_classes) free(self.value) @@ -798,6 +799,7 @@ cdef class Tree: cdef inline np.ndarray _apply_sparse_csr(self, object X): """Finds the terminal region (=leaf node) for each sample in sparse X. 
""" + print "entered _apply_sparse_csr in tree.pyx" # Check input if not isinstance(X, csr_matrix): raise ValueError("X should be in csr_matrix format, got %s" @@ -939,7 +941,7 @@ cdef class Tree: cdef inline object _decision_path_sparse_csr(self, object X): """Finds the decision path (=node) for each sample in X.""" - + print "entered _decision_path_sparse_csr in tree.pyx" # Check input if not isinstance(X, csr_matrix): raise ValueError("X should be in csr_matrix format, got %s" diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index 72a4e1c828b2e..4c70df1159af5 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -45,12 +45,13 @@ def _realloc_test(): cdef SIZE_t* p = NULL safe_realloc(&p, (-1) / 2) if p != NULL: + print "entered free in dealloc test" free(p) assert False -cdef void compute_weighted_median(double* median_dest, DOUBLE_t* y_vals, - DOUBLE_t* weights, SIZE_t start, SIZE_t end, +cdef void compute_weighted_median(double* median_dest, double* y_vals, + double* weights, SIZE_t start, SIZE_t end, DOUBLE_t* sample_weight, DOUBLE_t* y, SIZE_t* samples, SIZE_t y_stride, SIZE_t n_outputs) nogil: @@ -58,12 +59,9 @@ cdef void compute_weighted_median(double* median_dest, DOUBLE_t* y_vals, it into a destination pointer given values, weights, and a start and end index. """ - # cdef DOUBLE_t* sample_weight = self.sample_weight - # cdef DOUBLE_t* y = self.y - # cdef SIZE_t* samples = self.samples cdef DOUBLE_t w = 1.0 cdef DOUBLE_t y_ik - cdef SIZE_t n_node_samples = end-start + cdef SIZE_t n_node_samples = end - start cdef SIZE_t i, p, k cdef DOUBLE_t sum_weights @@ -99,7 +97,8 @@ cdef void compute_weighted_median(double* median_dest, DOUBLE_t* y_vals, median_dest[k] = y_vals[median_index] -cdef void sort_values_and_weights(DOUBLE_t* y_vals, DOUBLE_t* weights, + +cdef void sort_values_and_weights(double* y_vals, double* weights, SIZE_t low, SIZE_t high) nogil: """Sort an array and its weights""" cdef SIZE_t pivot, i, j, @@ -194,7 +193,9 @@ cdef class Stack: raise MemoryError() def __dealloc__(self): + print "entered dealloc in stack" free(self.stack_) + print "exited dealloc in stack" cdef bint is_empty(self) nogil: return self.top <= 0 @@ -316,6 +317,7 @@ cdef class PriorityHeap: raise MemoryError() def __dealloc__(self): + print "entered dealloc in priorityheap" free(self.heap_) cdef bint is_empty(self) nogil: diff --git a/sklearn/tree/tree.py b/sklearn/tree/tree.py index 4f8ebf9e960ed..d61fe54fa1198 100644 --- a/sklearn/tree/tree.py +++ b/sklearn/tree/tree.py @@ -371,7 +371,7 @@ def fit(self, X, y, sample_weight=None, check_input=True, if self.n_outputs_ == 1: self.n_classes_ = self.n_classes_[0] self.classes_ = self.classes_[0] - + print "done fitting" return self def _validate_X_predict(self, X, check_input): From adb244d8b41f0ab25ad72224dbb46c02b28c8387 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Wed, 18 May 2016 21:48:45 -0700 Subject: [PATCH 12/75] push latest changes, segfault probably happening bc of something in _utils.pyx --- sklearn/tree/_criterion.pyx | 33 ++++++++++----------------------- sklearn/tree/_utils.pxd | 3 +-- sklearn/tree/_utils.pyx | 31 ++++++++++++++++++++++--------- 3 files changed, 33 insertions(+), 34 deletions(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 0a0186fdd6390..c31882fc7b838 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -973,18 +973,9 @@ cdef class MAE(RegressionCriterion): """Computes the node value of samples[start:end] into dest.""" cdef SIZE_t 
start = self.start cdef SIZE_t end = self.end - cdef double* y_vals = NULL - cdef double* weights = NULL - y_vals = calloc(self.n_node_samples, - sizeof(double)) - weights = calloc(self.n_node_samples, - sizeof(double)) - if (y_vals == NULL or weights == NULL): - with gil: - raise MemoryError() - compute_weighted_median(dest, y_vals, weights, start, end, - self.sample_weight, self.y, self.samples, - self.y_stride, self.n_outputs) + + compute_weighted_median(dest, start, end, self.sample_weight, self.y, + self.samples, self.y_stride, self.n_outputs) cdef double node_impurity(self) nogil: """Evaluate the impurity of the current node, i.e. the impurity of @@ -992,7 +983,7 @@ cdef class MAE(RegressionCriterion): with gil: print "entered node_impurity" cdef double* medians = NULL - medians = calloc(self.n_outputs, sizeof(double)) + medians = calloc(self.n_outputs, sizeof(double)) if (medians == NULL): with gil: raise MemoryError() @@ -1042,21 +1033,17 @@ cdef class MAE(RegressionCriterion): cdef SIZE_t i, p, k cdef DOUBLE_t y_ik - cdef double* y_vals = NULL - cdef double* weights = NULL cdef double* medians = NULL - y_vals = calloc(self.n_node_samples, - sizeof(double)) - weights = calloc(self.n_node_samples, - sizeof(double)) medians = calloc(self.n_outputs, sizeof(double)) - if (y_vals == NULL or weights == NULL or medians == NULL): + if (medians == NULL): with gil: raise MemoryError() - compute_weighted_median(medians, y_vals, weights, start, pos, - self.sample_weight, self.y, self.samples, - self.y_stride, self.n_outputs) + for k in range(self.n_outputs): + medians[k] = k + compute_weighted_median(medians, start, pos, self.sample_weight, + self.y, self.samples, self.y_stride, + self.n_outputs) for k in range(self.n_outputs): for p in range(start, pos): diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd index cafecca9d124f..0678675ab1175 100644 --- a/sklearn/tree/_utils.pxd +++ b/sklearn/tree/_utils.pxd @@ -39,8 +39,7 @@ cdef realloc_ptr safe_realloc(realloc_ptr* p, size_t nelems) except * cdef np.ndarray sizet_ptr_to_ndarray(SIZE_t* data, SIZE_t size) -cdef void compute_weighted_median(double* median_dest, DOUBLE_t* y_vals, - DOUBLE_t* weights, SIZE_t start, SIZE_t end, +cdef void compute_weighted_median(double* median_dest, SIZE_t start, SIZE_t end, DOUBLE_t* sample_weight, DOUBLE_t* y, SIZE_t* samples, SIZE_t y_stride, SIZE_t n_outputs) nogil diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index 4c70df1159af5..5afbb1051924a 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -12,9 +12,11 @@ from libc.stdlib cimport free from libc.stdlib cimport malloc +from libc.stdlib cimport calloc from libc.stdlib cimport realloc from libc.math cimport log as ln + import numpy as np cimport numpy as np np.import_array() @@ -50,8 +52,7 @@ def _realloc_test(): assert False -cdef void compute_weighted_median(double* median_dest, double* y_vals, - double* weights, SIZE_t start, SIZE_t end, +cdef void compute_weighted_median(double* median_dest, SIZE_t start, SIZE_t end, DOUBLE_t* sample_weight, DOUBLE_t* y, SIZE_t* samples, SIZE_t y_stride, SIZE_t n_outputs) nogil: @@ -68,6 +69,18 @@ cdef void compute_weighted_median(double* median_dest, double* y_vals, cdef SIZE_t median_index cdef DOUBLE_t sum + cdef DOUBLE_t* y_vals = NULL + cdef DOUBLE_t* weights = NULL + y_vals = calloc(n_node_samples, + sizeof(DOUBLE_t)) + weights = calloc(n_node_samples, + sizeof(DOUBLE_t)) + + if (y_vals == NULL or weights == NULL): + with gil: + raise MemoryError() + + for k in 
range(n_outputs): median_index = 0 sum_weights = 0.0 @@ -78,17 +91,17 @@ cdef void compute_weighted_median(double* median_dest, double* y_vals, if sample_weight != NULL: w = sample_weight[i] - y_vals[p] = y_ik weights[p] = w - + y_vals[p] = y_ik + sort_values_and_weights(y_vals, weights, 0, + n_node_samples - 1) for p in range(start, end): sum_weights += weights[p] - sort_values_and_weights(y_vals, weights, 0, n_node_samples - 1) sum = sum_weights - weights[0] while(sum > sum_weights/2): - median_index +=1 + median_index += 1 sum -= weights[median_index] if sum == sum_weights/2: @@ -97,12 +110,11 @@ cdef void compute_weighted_median(double* median_dest, double* y_vals, median_dest[k] = y_vals[median_index] - -cdef void sort_values_and_weights(double* y_vals, double* weights, +cdef void sort_values_and_weights(DOUBLE_t* y_vals, DOUBLE_t* weights, SIZE_t low, SIZE_t high) nogil: """Sort an array and its weights""" cdef SIZE_t pivot, i, j, - cdef double temp + cdef DOUBLE_t temp if low < high: pivot = low i = low @@ -127,6 +139,7 @@ cdef void sort_values_and_weights(double* y_vals, double* weights, temp = weights[j] weights[j] = weights[pivot] weights[pivot] = temp + sort_values_and_weights(y_vals, weights, low, j-1) sort_values_and_weights(y_vals, weights, j+1, high) From ca5149aecef4440ecb699347ee679cea9da6bee4 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Thu, 19 May 2016 01:12:23 -0700 Subject: [PATCH 13/75] fix: fix segfault in median calculation and remove excessive logging --- sklearn/tree/_criterion.pyx | 13 ------------- sklearn/tree/_splitter.pyx | 5 ----- sklearn/tree/_tree.pyx | 4 ---- sklearn/tree/_utils.pyx | 20 ++++++++------------ sklearn/tree/tree.py | 1 - 5 files changed, 8 insertions(+), 35 deletions(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index c31882fc7b838..7b2b33cba5768 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -39,11 +39,9 @@ cdef class Criterion: def __dealloc__(self): """Destructor.""" - print "entered criterion dealloc" free(self.sum_total) free(self.sum_left) free(self.sum_right) - print "exited criterion dealloc" def __getstate__(self): return {} @@ -274,7 +272,6 @@ cdef class ClassificationCriterion(Criterion): def __dealloc__(self): """Destructor.""" - print "entered classificationcriterion dealloc" free(self.n_classes) def __reduce__(self): @@ -980,8 +977,6 @@ cdef class MAE(RegressionCriterion): cdef double node_impurity(self) nogil: """Evaluate the impurity of the current node, i.e. 
the impurity of samples[start:end]""" - with gil: - print "entered node_impurity" cdef double* medians = NULL medians = calloc(self.n_outputs, sizeof(double)) if (medians == NULL): @@ -997,8 +992,6 @@ cdef class MAE(RegressionCriterion): i = samples[p] y_ik = self.y[i * self.y_stride + k] impurity += (fabs(y_ik - medians[k]) / self.weighted_n_node_samples) - with gil: - print "impurity / self.n_outputs = {} / {} = {}".format(impurity, self.n_outputs, impurity / self.n_outputs) return impurity / self.n_outputs # cdef double proxy_impurity_improvement(self) nogil: @@ -1052,12 +1045,6 @@ cdef class MAE(RegressionCriterion): impurity_left[0] += fabs(y_ik - medians[k]) / (pos - start) impurity_right[0] = impurity_total - impurity_left[0] - with gil: - # print "start: {}".format(start) - # print "pos: {}".format(pos) - # print "end: {}".format(end) - print "impurity_left[0]: {}".format(impurity_left[0]) - print "impurity_right[0]: {}".format(impurity_right[0]) cdef class FriedmanMSE(MSE): diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index c2ea129a86f25..8d8bf1f985bf9 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -104,12 +104,10 @@ cdef class Splitter: def __dealloc__(self): """Destructor.""" - print "entered splitter dealloc" free(self.samples) free(self.features) free(self.constant_features) free(self.feature_values) - print "exited splitter dealloc" def __getstate__(self): return {} @@ -254,11 +252,9 @@ cdef class BaseDenseSplitter(Splitter): self.presort = presort def __dealloc__(self): - print "entered basedensesplitter dealloc" """Destructor.""" if self.presort == 1: free(self.sample_mask) - print "exited basedensesplitter dealloc" cdef void init(self, object X, @@ -864,7 +860,6 @@ cdef class BaseSparseSplitter(Splitter): self.sorted_samples = NULL def __dealloc__(self): - print "entered basesparsesplitter dealloc" """Deallocate memory.""" free(self.index_to_samples) free(self.sorted_samples) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index d0f384589a626..1a3c5877e75ba 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -264,7 +264,6 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): tree.max_depth = max_depth_seen if rc == -1: raise MemoryError() - print "dont building" # Best first builder ---------------------------------------------------------- @@ -603,7 +602,6 @@ cdef class Tree: def __dealloc__(self): """Destructor.""" - print "entered tree dealloc" # Free all inner structures free(self.n_classes) free(self.value) @@ -799,7 +797,6 @@ cdef class Tree: cdef inline np.ndarray _apply_sparse_csr(self, object X): """Finds the terminal region (=leaf node) for each sample in sparse X. 
""" - print "entered _apply_sparse_csr in tree.pyx" # Check input if not isinstance(X, csr_matrix): raise ValueError("X should be in csr_matrix format, got %s" @@ -941,7 +938,6 @@ cdef class Tree: cdef inline object _decision_path_sparse_csr(self, object X): """Finds the decision path (=node) for each sample in X.""" - print "entered _decision_path_sparse_csr in tree.pyx" # Check input if not isinstance(X, csr_matrix): raise ValueError("X should be in csr_matrix format, got %s" diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index 5afbb1051924a..020812773630c 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -47,7 +47,6 @@ def _realloc_test(): cdef SIZE_t* p = NULL safe_realloc(&p, (-1) / 2) if p != NULL: - print "entered free in dealloc test" free(p) assert False @@ -67,7 +66,7 @@ cdef void compute_weighted_median(double* median_dest, SIZE_t start, SIZE_t end, cdef SIZE_t i, p, k cdef DOUBLE_t sum_weights cdef SIZE_t median_index - cdef DOUBLE_t sum + cdef DOUBLE_t running_sum cdef DOUBLE_t* y_vals = NULL cdef DOUBLE_t* weights = NULL @@ -80,11 +79,11 @@ cdef void compute_weighted_median(double* median_dest, SIZE_t start, SIZE_t end, with gil: raise MemoryError() - for k in range(n_outputs): median_index = 0 + # median_index = start sum_weights = 0.0 - for p in range(start,end): + for p in range(0, n_node_samples): i = samples[p] y_ik = y[i * y_stride + k] @@ -95,16 +94,16 @@ cdef void compute_weighted_median(double* median_dest, SIZE_t start, SIZE_t end, y_vals[p] = y_ik sort_values_and_weights(y_vals, weights, 0, n_node_samples - 1) - for p in range(start, end): + for p in range(0, n_node_samples): sum_weights += weights[p] - sum = sum_weights - weights[0] + running_sum = sum_weights - weights[0] - while(sum > sum_weights/2): + while(running_sum > sum_weights/2): median_index += 1 - sum -= weights[median_index] + running_sum -= weights[median_index] - if sum == sum_weights/2: + if running_sum == sum_weights/2: median_dest[k] = (y_vals[median_index] + y_vals[median_index + 1]) / 2 else: median_dest[k] = y_vals[median_index] @@ -206,9 +205,7 @@ cdef class Stack: raise MemoryError() def __dealloc__(self): - print "entered dealloc in stack" free(self.stack_) - print "exited dealloc in stack" cdef bint is_empty(self) nogil: return self.top <= 0 @@ -330,7 +327,6 @@ cdef class PriorityHeap: raise MemoryError() def __dealloc__(self): - print "entered dealloc in priorityheap" free(self.heap_) cdef bint is_empty(self) nogil: diff --git a/sklearn/tree/tree.py b/sklearn/tree/tree.py index d61fe54fa1198..a1e3f0cfecdbc 100644 --- a/sklearn/tree/tree.py +++ b/sklearn/tree/tree.py @@ -371,7 +371,6 @@ def fit(self, X, y, sample_weight=None, check_input=True, if self.n_outputs_ == 1: self.n_classes_ = self.n_classes_[0] self.classes_ = self.classes_[0] - print "done fitting" return self def _validate_X_predict(self, X, check_input): From 1e5a969e9f0545765d2f2c4b6c29cab1a4744d09 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Thu, 19 May 2016 01:20:07 -0700 Subject: [PATCH 14/75] chore: revert some misc spacing changes I accidentally made --- sklearn/tree/_criterion.pyx | 4 ++-- sklearn/tree/_tree.pyx | 2 ++ sklearn/tree/tree.py | 1 + 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 7b2b33cba5768..96af724d0680e 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -39,6 +39,7 @@ cdef class Criterion: def __dealloc__(self): """Destructor.""" + free(self.sum_total) 
free(self.sum_left) free(self.sum_right) @@ -169,7 +170,6 @@ cdef class Criterion: return (- self.weighted_n_right * impurity_right - self.weighted_n_left * impurity_left) - cdef double impurity_improvement(self, double impurity) nogil: """Placeholder for improvement in impurity after a split. @@ -272,6 +272,7 @@ cdef class ClassificationCriterion(Criterion): def __dealloc__(self): """Destructor.""" + free(self.n_classes) def __reduce__(self): @@ -723,7 +724,6 @@ cdef class RegressionCriterion(Criterion): self.sum_left = calloc(n_outputs, sizeof(double)) self.sum_right = calloc(n_outputs, sizeof(double)) - if (self.sum_total == NULL or self.sum_left == NULL or self.sum_right == NULL): diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 1a3c5877e75ba..f44320a7b47ae 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -265,6 +265,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): if rc == -1: raise MemoryError() + # Best first builder ---------------------------------------------------------- cdef inline int _add_to_frontier(PriorityHeapRecord* rec, @@ -938,6 +939,7 @@ cdef class Tree: cdef inline object _decision_path_sparse_csr(self, object X): """Finds the decision path (=node) for each sample in X.""" + # Check input if not isinstance(X, csr_matrix): raise ValueError("X should be in csr_matrix format, got %s" diff --git a/sklearn/tree/tree.py b/sklearn/tree/tree.py index a1e3f0cfecdbc..4f8ebf9e960ed 100644 --- a/sklearn/tree/tree.py +++ b/sklearn/tree/tree.py @@ -371,6 +371,7 @@ def fit(self, X, y, sample_weight=None, check_input=True, if self.n_outputs_ == 1: self.n_classes_ = self.n_classes_[0] self.classes_ = self.classes_[0] + return self def _validate_X_predict(self, X, check_input): From 99132ace58900c34a7347e12e732edec2db65ab0 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Thu, 19 May 2016 01:21:34 -0700 Subject: [PATCH 15/75] chore: one last spacing fix in _splitter.pyx --- sklearn/tree/_splitter.pyx | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 8d8bf1f985bf9..0617508aab236 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -104,6 +104,7 @@ cdef class Splitter: def __dealloc__(self): """Destructor.""" + free(self.samples) free(self.features) free(self.constant_features) From 9655fb071bdd6b4d93643748a2e1f73a527ef18b Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Thu, 19 May 2016 14:10:44 -0700 Subject: [PATCH 16/75] feature: don't calculate weighted median if no weights are passed in --- sklearn/tree/_utils.pyx | 54 ++++++++++++++++++++++++----------------- 1 file changed, 32 insertions(+), 22 deletions(-) diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index 020812773630c..c49584894b055 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -72,16 +72,18 @@ cdef void compute_weighted_median(double* median_dest, SIZE_t start, SIZE_t end, cdef DOUBLE_t* weights = NULL y_vals = calloc(n_node_samples, sizeof(DOUBLE_t)) - weights = calloc(n_node_samples, - sizeof(DOUBLE_t)) + if sample_weight != NULL: + with gil: + print "made weights arry" + weights = calloc(n_node_samples, + sizeof(DOUBLE_t)) - if (y_vals == NULL or weights == NULL): + if (y_vals == NULL or (weights == NULL and sample_weight != NULL)): with gil: raise MemoryError() for k in range(n_outputs): median_index = 0 - # median_index = start sum_weights = 0.0 for p in range(0, n_node_samples): i = samples[p] @@ -89,25 +91,32 @@ cdef void compute_weighted_median(double* 
median_dest, SIZE_t start, SIZE_t end, y_ik = y[i * y_stride + k] if sample_weight != NULL: w = sample_weight[i] + weights[p] = w - weights[p] = w y_vals[p] = y_ik sort_values_and_weights(y_vals, weights, 0, n_node_samples - 1) - for p in range(0, n_node_samples): - sum_weights += weights[p] + if sample_weight != NULL: + # calculate the weighted median + for p in range(0, n_node_samples): + sum_weights += weights[p] - running_sum = sum_weights - weights[0] + running_sum = sum_weights - weights[0] - while(running_sum > sum_weights/2): - median_index += 1 - running_sum -= weights[median_index] + while(running_sum > sum_weights/2): + median_index += 1 + running_sum -= weights[median_index] - if running_sum == sum_weights/2: - median_dest[k] = (y_vals[median_index] + y_vals[median_index + 1]) / 2 + if running_sum == sum_weights/2: + median_dest[k] = (y_vals[median_index] + y_vals[median_index + 1]) / 2 + else: + median_dest[k] = y_vals[median_index] else: - median_dest[k] = y_vals[median_index] - + # calculate the unweighted median + if n_node_samples % 2 == 0: + median_dest[k] = (y_vals[n_node_samples / 2] + y_vals[(n_node_samples / 2) - 1])/2 + else: + median_dest[k] = y_vals[n_node_samples / 2] cdef void sort_values_and_weights(DOUBLE_t* y_vals, DOUBLE_t* weights, SIZE_t low, SIZE_t high) nogil: @@ -127,17 +136,18 @@ cdef void sort_values_and_weights(DOUBLE_t* y_vals, DOUBLE_t* weights, temp = y_vals[i] y_vals[i] = y_vals[j] y_vals[j] = temp - - temp = weights[i] - weights[i] = weights[j] - weights[j] = temp + if weights != NULL: + temp = weights[i] + weights[i] = weights[j] + weights[j] = temp temp = y_vals[j] y_vals[j] = y_vals[pivot] y_vals[pivot] = temp - temp = weights[j] - weights[j] = weights[pivot] - weights[pivot] = temp + if weights != NULL: + temp = weights[j] + weights[j] = weights[pivot] + weights[pivot] = temp sort_values_and_weights(y_vals, weights, low, j-1) sort_values_and_weights(y_vals, weights, j+1, high) From e0476b97b439dfb5684fded4636b4a36c5852ea1 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Thu, 19 May 2016 17:30:07 -0700 Subject: [PATCH 17/75] remove extraneous logging statement --- sklearn/tree/_utils.pyx | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index c49584894b055..6f747967c6c49 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -72,9 +72,8 @@ cdef void compute_weighted_median(double* median_dest, SIZE_t start, SIZE_t end, cdef DOUBLE_t* weights = NULL y_vals = calloc(n_node_samples, sizeof(DOUBLE_t)) + if sample_weight != NULL: - with gil: - print "made weights arry" weights = calloc(n_node_samples, sizeof(DOUBLE_t)) @@ -85,6 +84,7 @@ cdef void compute_weighted_median(double* median_dest, SIZE_t start, SIZE_t end, for k in range(n_outputs): median_index = 0 sum_weights = 0.0 + for p in range(0, n_node_samples): i = samples[p] @@ -94,6 +94,7 @@ cdef void compute_weighted_median(double* median_dest, SIZE_t start, SIZE_t end, weights[p] = w y_vals[p] = y_ik + sort_values_and_weights(y_vals, weights, 0, n_node_samples - 1) if sample_weight != NULL: @@ -118,6 +119,7 @@ cdef void compute_weighted_median(double* median_dest, SIZE_t start, SIZE_t end, else: median_dest[k] = y_vals[n_node_samples / 2] + cdef void sort_values_and_weights(DOUBLE_t* y_vals, DOUBLE_t* weights, SIZE_t low, SIZE_t high) nogil: """Sort an array and its weights""" From 04dfc7ee66c2e3aad9ec13990af57e3c1c5fa748 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Thu, 19 May 2016 20:34:33 -0700 
Subject: [PATCH 18/75] fix: fix children impurity calculation

---
 sklearn/tree/_criterion.pyx | 20 ++++++++++++--------
 sklearn/tree/_utils.pyx     |  2 +-
 2 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx
index 96af724d0680e..e42a08006af26 100644
--- a/sklearn/tree/_criterion.pyx
+++ b/sklearn/tree/_criterion.pyx
@@ -888,7 +888,6 @@ cdef class MSE(RegressionCriterion):
         impurity = self.sq_sum_total / self.weighted_n_node_samples
         for k in range(self.n_outputs):
             impurity -= (sum_total[k] / self.weighted_n_node_samples)**2.0
-
         return impurity / self.n_outputs

     cdef double proxy_impurity_improvement(self) nogil:
@@ -970,7 +969,6 @@ cdef class MAE(RegressionCriterion):
         """Computes the node value of samples[start:end] into dest."""
         cdef SIZE_t start = self.start
         cdef SIZE_t end = self.end
-
         compute_weighted_median(dest, start, end, self.sample_weight, self.y,
                                 self.samples, self.y_stride, self.n_outputs)

@@ -1021,8 +1019,6 @@ cdef class MAE(RegressionCriterion):
         cdef SIZE_t pos = self.pos
         cdef SIZE_t end = self.end

-        cdef double impurity_total = self.node_impurity()
-
         cdef SIZE_t i, p, k
         cdef DOUBLE_t y_ik

@@ -1032,8 +1028,7 @@ cdef class MAE(RegressionCriterion):
         if (medians == NULL):
             with gil:
                 raise MemoryError()
-        for k in range(self.n_outputs):
-            medians[k] = k
+
         compute_weighted_median(medians, start, pos, self.sample_weight,
                                 self.y, self.samples, self.y_stride,
                                 self.n_outputs)
@@ -1042,9 +1037,18 @@ cdef class MAE(RegressionCriterion):
             for p in range(start, pos):
                 i = samples[p]
                 y_ik = y[i * self.y_stride + k]
-                impurity_left[0] += fabs(y_ik - medians[k]) / (pos - start)
+                impurity_left[0] += (fabs(y_ik - medians[k]) / (pos - start))
+        impurity_left[0] /= self.n_outputs

-        impurity_right[0] = impurity_total - impurity_left[0]
+        compute_weighted_median(medians, pos, end, self.sample_weight,
+                                self.y, self.samples, self.y_stride,
+                                self.n_outputs)
+        for k in range(self.n_outputs):
+            for p in range(pos, end):
+                i = samples[p]
+                y_ik = y[i * self.y_stride + k]
+                impurity_right[0] += (fabs(y_ik - medians[k]) / (end - pos))
+        impurity_right[0] /= self.n_outputs


 cdef class FriedmanMSE(MSE):
diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx
index 6f747967c6c49..78998b36f39e3 100644
--- a/sklearn/tree/_utils.pyx
+++ b/sklearn/tree/_utils.pyx
@@ -86,7 +86,7 @@ cdef void compute_weighted_median(double* median_dest, SIZE_t start, SIZE_t end,
         sum_weights = 0.0

         for p in range(0, n_node_samples):
-            i = samples[p]
+            i = samples[p + start]

             y_ik = y[i * y_stride + k]
             if sample_weight != NULL:

From a61782f80ca61517fb5cbea5d7efb2385f8f0716 Mon Sep 17 00:00:00 2001
From: Nelson Liu
Date: Thu, 19 May 2016 23:46:25 -0700
Subject: [PATCH 19/75] fix: fix bug with children impurity not being
 initially set to 0

---
 sklearn/tree/_criterion.pyx | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx
index e42a08006af26..9e2f648c23f0e 100644
--- a/sklearn/tree/_criterion.pyx
+++ b/sklearn/tree/_criterion.pyx
@@ -990,6 +990,8 @@ cdef class MAE(RegressionCriterion):
             i = samples[p]
             y_ik = self.y[i * self.y_stride + k]
             impurity += (fabs(y_ik - medians[k]) / self.weighted_n_node_samples)
+        with gil:
+            print impurity / self.n_outputs
         return impurity / self.n_outputs

     # cdef double proxy_impurity_improvement(self) nogil:
@@ -1021,6 +1023,7 @@ cdef class MAE(RegressionCriterion):

         cdef SIZE_t i, p, k
         cdef DOUBLE_t y_ik
+        cdef double test

         cdef double* medians = NULL

@@
-1032,12 +1035,17 @@ cdef class MAE(RegressionCriterion): compute_weighted_median(medians, start, pos, self.sample_weight, self.y, self.samples, self.y_stride, self.n_outputs) - + impurity_left[0] = 0.0 + impurity_right[0] = 0.0 for k in range(self.n_outputs): for p in range(start, pos): i = samples[p] y_ik = y[i * self.y_stride + k] - impurity_left[0] += (fabs(y_ik - medians[k]) / (pos - start)) + test = (fabs(y_ik - medians[k]) / (pos-start)) + # with gil: + # print test + # impurity_left[0] += (fabs(y_ik - medians[k]) / (pos - start)) + impurity_left[0] += test impurity_left[0] /= self.n_outputs compute_weighted_median(medians, pos, end, self.sample_weight, From 33af0fbbbaa0793c5f6e4ca051753065a1b702c7 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Sat, 21 May 2016 13:38:41 -0700 Subject: [PATCH 20/75] fix: hacky fix for a float accuracy error --- sklearn/tree/_criterion.pyx | 23 ++++++++++------------- sklearn/tree/_splitter.pyx | 8 +++++++- 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 9e2f648c23f0e..15177bcd7be95 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -166,7 +166,6 @@ cdef class Criterion: cdef double impurity_left cdef double impurity_right self.children_impurity(&impurity_left, &impurity_right) - return (- self.weighted_n_right * impurity_right - self.weighted_n_left * impurity_left) @@ -198,7 +197,6 @@ cdef class Criterion: cdef double impurity_right self.children_impurity(&impurity_left, &impurity_right) - return ((self.weighted_n_node_samples / self.weighted_n_samples) * (impurity - (self.weighted_n_right / self.weighted_n_node_samples * impurity_right) @@ -821,7 +819,6 @@ cdef class RegressionCriterion(Criterion): # and that sum_total is known, we are going to update # sum_left from the direction that require the least amount # of computations, i.e. from pos to new_pos or from end to new_po. - if (new_pos - pos) <= (end - new_pos): for p in range(pos, new_pos): i = samples[p] @@ -971,6 +968,7 @@ cdef class MAE(RegressionCriterion): cdef SIZE_t end = self.end compute_weighted_median(dest, start, end, self.sample_weight, self.y, self.samples, self.y_stride, self.n_outputs) + cdef SIZE_t i cdef double node_impurity(self) nogil: """Evaluate the impurity of the current node, i.e. 
the impurity of @@ -990,8 +988,6 @@ cdef class MAE(RegressionCriterion): i = samples[p] y_ik = self.y[i * self.y_stride + k] impurity += (fabs(y_ik - medians[k]) / self.weighted_n_node_samples) - with gil: - print impurity / self.n_outputs return impurity / self.n_outputs # cdef double proxy_impurity_improvement(self) nogil: @@ -1023,7 +1019,6 @@ cdef class MAE(RegressionCriterion): cdef SIZE_t i, p, k cdef DOUBLE_t y_ik - cdef double test cdef double* medians = NULL @@ -1037,16 +1032,16 @@ cdef class MAE(RegressionCriterion): self.n_outputs) impurity_left[0] = 0.0 impurity_right[0] = 0.0 + for k in range(self.n_outputs): for p in range(start, pos): i = samples[p] y_ik = y[i * self.y_stride + k] - test = (fabs(y_ik - medians[k]) / (pos-start)) - # with gil: - # print test # impurity_left[0] += (fabs(y_ik - medians[k]) / (pos - start)) - impurity_left[0] += test - impurity_left[0] /= self.n_outputs + impurity_left[0] += fabs(y_ik - medians[k]) + # impurity_left[0] /= self.n_outputs + impurity_left[0] /= ((pos - start) * self.n_outputs) + compute_weighted_median(medians, pos, end, self.sample_weight, self.y, self.samples, self.y_stride, @@ -1055,8 +1050,10 @@ cdef class MAE(RegressionCriterion): for p in range(pos, end): i = samples[p] y_ik = y[i * self.y_stride + k] - impurity_right[0] += (fabs(y_ik - medians[k]) / (end - pos)) - impurity_right[0] /= self.n_outputs + # impurity_right[0] += (fabs(y_ik - medians[k]) / (end - pos)) + impurity_right[0] += fabs(y_ik - medians[k]) + # impurity_right[0] /= self.n_outputs + impurity_right[0] /= ((end - pos) * self.n_outputs) cdef class FriedmanMSE(MSE): diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 0617508aab236..31848546d8732 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -458,7 +458,6 @@ cdef class BestSplitter(BaseDenseSplitter): continue current_proxy_improvement = self.criterion.proxy_impurity_improvement() - if current_proxy_improvement > best_proxy_improvement: best_proxy_improvement = current_proxy_improvement current.threshold = (Xf[p - 1] + Xf[p]) / 2.0 @@ -486,6 +485,7 @@ cdef class BestSplitter(BaseDenseSplitter): samples[p] = tmp self.criterion.reset() + self.criterion.update(best.pos) best.improvement = self.criterion.impurity_improvement(impurity) self.criterion.children_impurity(&best.impurity_left, @@ -782,6 +782,7 @@ cdef class RandomSplitter(BaseDenseSplitter): # Evaluate split self.criterion.reset() + self.criterion.update(current.pos) # Reject if min_weight_leaf is not satisfied @@ -815,6 +816,7 @@ cdef class RandomSplitter(BaseDenseSplitter): self.criterion.reset() + self.criterion.update(best.pos) best.improvement = self.criterion.impurity_improvement(impurity) self.criterion.children_impurity(&best.impurity_left, @@ -1354,6 +1356,7 @@ cdef class BestSparseSplitter(BaseSparseSplitter): best = current # Reorganize into samples[start:best.pos] + samples[best.pos:end] + if best.pos < end: self.extract_nnz(best.feature, &end_negative, &start_positive, &is_samples_sorted) @@ -1362,6 +1365,7 @@ cdef class BestSparseSplitter(BaseSparseSplitter): best.pos) self.criterion.reset() + self.criterion.update(best.pos) best.improvement = self.criterion.impurity_improvement(impurity) self.criterion.children_impurity(&best.impurity_left, @@ -1563,6 +1567,7 @@ cdef class RandomSparseSplitter(BaseSparseSplitter): # Evaluate split self.criterion.reset() + self.criterion.update(current.pos) # Reject if min_weight_leaf is not satisfied @@ -1590,6 +1595,7 @@ cdef class 
RandomSparseSplitter(BaseSparseSplitter): best.pos) self.criterion.reset() + self.criterion.update(best.pos) best.improvement = self.criterion.impurity_improvement(impurity) self.criterion.children_impurity(&best.impurity_left, From 5844d810a29b5e83d50033431d3eddc6e895d0a5 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Sat, 21 May 2016 22:44:38 -0700 Subject: [PATCH 21/75] fix: incorrect type cast in median array generation for node_impurity --- sklearn/tree/_criterion.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 15177bcd7be95..0b89753af45a7 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -974,7 +974,7 @@ cdef class MAE(RegressionCriterion): """Evaluate the impurity of the current node, i.e. the impurity of samples[start:end]""" cdef double* medians = NULL - medians = calloc(self.n_outputs, sizeof(double)) + medians = calloc(self.n_outputs, sizeof(double)) if (medians == NULL): with gil: raise MemoryError() From 134eb9250ee015dbdd2516f33fe2c7a7c2ed69a9 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Sun, 22 May 2016 00:14:38 -0700 Subject: [PATCH 22/75] slightly tweak node_impurity function --- sklearn/tree/_criterion.pyx | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 0b89753af45a7..e6453b9a4e1c4 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -968,7 +968,6 @@ cdef class MAE(RegressionCriterion): cdef SIZE_t end = self.end compute_weighted_median(dest, start, end, self.sample_weight, self.y, self.samples, self.y_stride, self.n_outputs) - cdef SIZE_t i cdef double node_impurity(self) nogil: """Evaluate the impurity of the current node, i.e. 
the impurity of @@ -987,8 +986,10 @@ cdef class MAE(RegressionCriterion): for p in range(self.start, self.end): i = samples[p] y_ik = self.y[i * self.y_stride + k] - impurity += (fabs(y_ik - medians[k]) / self.weighted_n_node_samples) - return impurity / self.n_outputs + # impurity += (fabs(y_ik - medians[k]) / self.weighted_n_node_samples) + impurity += fabs(y_ik - medians[k]) + # return impurity / self.n_outputs + return impurity / (self.weighted_n_node_samples * self.n_outputs) # cdef double proxy_impurity_improvement(self) nogil: # """Compute a proxy of the impurity reduction From 115df19f5cdfde4a14d435d19b1f37dc4f58e72b Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Sun, 22 May 2016 12:26:19 -0700 Subject: [PATCH 23/75] fix: be more explicit with casts --- sklearn/tree/_criterion.pyx | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index e6453b9a4e1c4..adade6eba2fd9 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -987,7 +987,7 @@ cdef class MAE(RegressionCriterion): i = samples[p] y_ik = self.y[i * self.y_stride + k] # impurity += (fabs(y_ik - medians[k]) / self.weighted_n_node_samples) - impurity += fabs(y_ik - medians[k]) + impurity += fabs(( y_ik) - medians[k]) # return impurity / self.n_outputs return impurity / (self.weighted_n_node_samples * self.n_outputs) @@ -1039,9 +1039,9 @@ cdef class MAE(RegressionCriterion): i = samples[p] y_ik = y[i * self.y_stride + k] # impurity_left[0] += (fabs(y_ik - medians[k]) / (pos - start)) - impurity_left[0] += fabs(y_ik - medians[k]) + impurity_left[0] += fabs((y_ik) - medians[k]) # impurity_left[0] /= self.n_outputs - impurity_left[0] /= ((pos - start) * self.n_outputs) + impurity_left[0] /= ((pos - start) * self.n_outputs) compute_weighted_median(medians, pos, end, self.sample_weight, @@ -1052,9 +1052,9 @@ cdef class MAE(RegressionCriterion): i = samples[p] y_ik = y[i * self.y_stride + k] # impurity_right[0] += (fabs(y_ik - medians[k]) / (end - pos)) - impurity_right[0] += fabs(y_ik - medians[k]) + impurity_right[0] += fabs((y_ik) - medians[k]) # impurity_right[0] /= self.n_outputs - impurity_right[0] /= ((end - pos) * self.n_outputs) + impurity_right[0] /= ((end - pos) * self.n_outputs) cdef class FriedmanMSE(MSE): From 6fa918fc0a7fe4f2a3d673ff69e5a2bf4fd069d1 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Fri, 27 May 2016 15:43:37 -0700 Subject: [PATCH 24/75] feature: revert cosmetic changes and free temporary arrays --- sklearn/tree/_criterion.pxd | 2 +- sklearn/tree/_criterion.pyx | 17 +++++++++++------ sklearn/tree/_splitter.pyx | 7 ------- sklearn/tree/_utils.pyx | 4 +++- 4 files changed, 15 insertions(+), 15 deletions(-) diff --git a/sklearn/tree/_criterion.pxd b/sklearn/tree/_criterion.pxd index 172d57659e6a6..889a623d732b3 100644 --- a/sklearn/tree/_criterion.pxd +++ b/sklearn/tree/_criterion.pxd @@ -44,7 +44,7 @@ cdef class Criterion: # weighted count of each label. For regression, # the sum of w*y. sum_total[k] is equal to # sum_{i=start}^{end-1} w[samples[i]]*y[samples[i], k], - # where k is output index. + # where k is output index. 
cdef double* sum_left # Same as above, but for the left side of the split cdef double* sum_right # same as above, but for the right side of the split diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index adade6eba2fd9..98451ff2c00b5 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -166,6 +166,7 @@ cdef class Criterion: cdef double impurity_left cdef double impurity_right self.children_impurity(&impurity_left, &impurity_right) + return (- self.weighted_n_right * impurity_right - self.weighted_n_left * impurity_left) @@ -198,9 +199,9 @@ cdef class Criterion: self.children_impurity(&impurity_left, &impurity_right) return ((self.weighted_n_node_samples / self.weighted_n_samples) * - (impurity - (self.weighted_n_right / + (impurity - (self.weighted_n_right / self.weighted_n_node_samples * impurity_right) - - (self.weighted_n_left / + - (self.weighted_n_left / self.weighted_n_node_samples * impurity_left))) @@ -263,7 +264,7 @@ cdef class ClassificationCriterion(Criterion): self.sum_left = calloc(n_elements, sizeof(double)) self.sum_right = calloc(n_elements, sizeof(double)) - if (self.sum_total == NULL or + if (self.sum_total == NULL or self.sum_left == NULL or self.sum_right == NULL): raise MemoryError() @@ -819,6 +820,7 @@ cdef class RegressionCriterion(Criterion): # and that sum_total is known, we are going to update # sum_left from the direction that require the least amount # of computations, i.e. from pos to new_pos or from end to new_po. + if (new_pos - pos) <= (end - new_pos): for p in range(pos, new_pos): i = samples[p] @@ -846,7 +848,7 @@ cdef class RegressionCriterion(Criterion): self.weighted_n_left -= w - self.weighted_n_right = (self.weighted_n_node_samples - + self.weighted_n_right = (self.weighted_n_node_samples - self.weighted_n_left) for k in range(self.n_outputs): sum_right[k] = sum_total[k] - sum_left[k] @@ -885,6 +887,7 @@ cdef class MSE(RegressionCriterion): impurity = self.sq_sum_total / self.weighted_n_node_samples for k in range(self.n_outputs): impurity -= (sum_total[k] / self.weighted_n_node_samples)**2.0 + return impurity / self.n_outputs cdef double proxy_impurity_improvement(self) nogil: @@ -919,6 +922,7 @@ cdef class MSE(RegressionCriterion): left child (samples[start:pos]) and the impurity the right child (samples[pos:end]).""" + cdef DOUBLE_t* y = self.y cdef DOUBLE_t* sample_weight = self.sample_weight cdef SIZE_t* samples = self.samples @@ -960,8 +964,7 @@ cdef class MSE(RegressionCriterion): impurity_right[0] /= self.n_outputs cdef class MAE(RegressionCriterion): - """Mean absolute error impurity criterion - """ + """Mean absolute error impurity criterion""" cdef void node_value(self, double* dest) nogil: """Computes the node value of samples[start:end] into dest.""" cdef SIZE_t start = self.start @@ -989,6 +992,7 @@ cdef class MAE(RegressionCriterion): # impurity += (fabs(y_ik - medians[k]) / self.weighted_n_node_samples) impurity += fabs(( y_ik) - medians[k]) # return impurity / self.n_outputs + free(medians) return impurity / (self.weighted_n_node_samples * self.n_outputs) # cdef double proxy_impurity_improvement(self) nogil: @@ -1055,6 +1059,7 @@ cdef class MAE(RegressionCriterion): impurity_right[0] += fabs((y_ik) - medians[k]) # impurity_right[0] /= self.n_outputs impurity_right[0] /= ((end - pos) * self.n_outputs) + free(medians) cdef class FriedmanMSE(MSE): diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 31848546d8732..39c17fc1ca55e 100644 --- a/sklearn/tree/_splitter.pyx 
+++ b/sklearn/tree/_splitter.pyx
@@ -485,7 +485,6 @@ cdef class BestSplitter(BaseDenseSplitter):
                     samples[p] = tmp

         self.criterion.reset()
-
         self.criterion.update(best.pos)
         best.improvement = self.criterion.impurity_improvement(impurity)
         self.criterion.children_impurity(&best.impurity_left,
@@ -782,7 +781,6 @@ cdef class RandomSplitter(BaseDenseSplitter):

                 # Evaluate split
                 self.criterion.reset()
-
                 self.criterion.update(current.pos)

                 # Reject if min_weight_leaf is not satisfied
@@ -816,7 +814,6 @@ cdef class RandomSplitter(BaseDenseSplitter):

         self.criterion.reset()
-
         self.criterion.update(best.pos)
         best.improvement = self.criterion.impurity_improvement(impurity)
         self.criterion.children_impurity(&best.impurity_left,
@@ -1356,7 +1353,6 @@ cdef class BestSparseSplitter(BaseSparseSplitter):
                             best = current

         # Reorganize into samples[start:best.pos] + samples[best.pos:end]
-
         if best.pos < end:
             self.extract_nnz(best.feature, &end_negative, &start_positive,
                              &is_samples_sorted)
@@ -1365,7 +1361,6 @@ cdef class BestSparseSplitter(BaseSparseSplitter):
                             best.pos)

         self.criterion.reset()
-
         self.criterion.update(best.pos)
         best.improvement = self.criterion.impurity_improvement(impurity)
         self.criterion.children_impurity(&best.impurity_left,
@@ -1567,7 +1562,6 @@ cdef class RandomSparseSplitter(BaseSparseSplitter):

                     # Evaluate split
                     self.criterion.reset()
-
                     self.criterion.update(current.pos)

                     # Reject if min_weight_leaf is not satisfied
@@ -1595,7 +1589,6 @@ cdef class RandomSparseSplitter(BaseSparseSplitter):
                             best.pos)

         self.criterion.reset()
-
         self.criterion.update(best.pos)
         best.improvement = self.criterion.impurity_improvement(impurity)
         self.criterion.children_impurity(&best.impurity_left,
diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx
index 78998b36f39e3..bf6d8bccf99d0 100644
--- a/sklearn/tree/_utils.pyx
+++ b/sklearn/tree/_utils.pyx
@@ -85,7 +85,7 @@ cdef void compute_weighted_median(double* median_dest, SIZE_t start, SIZE_t end,
         median_index = 0
         sum_weights = 0.0

-        for p in range(0, n_node_samples):
+        for p in range(n_node_samples):
             i = samples[p + start]

             y_ik = y[i * y_stride + k]
@@ -118,6 +118,8 @@ cdef void compute_weighted_median(double* median_dest, SIZE_t start, SIZE_t end,
             median_dest[k] = (y_vals[n_node_samples / 2] + y_vals[(n_node_samples / 2) - 1])/2
         else:
             median_dest[k] = y_vals[n_node_samples / 2]
+    free(y_vals)
+    free(weights)

From 8d005941a3b8b9f0cb88398040bef805735ccfff Mon Sep 17 00:00:00 2001
From: Nelson Liu
Date: Fri, 27 May 2016 15:54:53 -0700
Subject: [PATCH 25/75] fix: only free weight array in median calculation if
 it was created

---
 sklearn/tree/_criterion.pyx | 7 ++++---
 sklearn/tree/_splitter.pyx  | 1 +
 sklearn/tree/_utils.pyx     | 3 ++-
 3 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx
index 98451ff2c00b5..f715d6f5e4e50 100644
--- a/sklearn/tree/_criterion.pyx
+++ b/sklearn/tree/_criterion.pyx
@@ -198,6 +198,7 @@ cdef class Criterion:
         cdef double impurity_right

         self.children_impurity(&impurity_left, &impurity_right)
+
         return ((self.weighted_n_node_samples / self.weighted_n_samples) *
                 (impurity - (self.weighted_n_right /
                              self.weighted_n_node_samples * impurity_right)
@@ -723,7 +724,7 @@ cdef class RegressionCriterion(Criterion):
         self.sum_left = calloc(n_outputs, sizeof(double))
         self.sum_right = calloc(n_outputs, sizeof(double))

-        if (self.sum_total == NULL or
+        if (self.sum_total == NULL or
             self.sum_left == NULL or self.sum_right ==
NULL): raise MemoryError() @@ -958,7 +959,7 @@ cdef class MSE(RegressionCriterion): for k in range(self.n_outputs): impurity_left[0] -= (sum_left[k] / self.weighted_n_left) ** 2.0 - impurity_right[0] -= (sum_right[k] / self.weighted_n_right) ** 2.0 + impurity_right[0] -= (sum_right[k] / self.weighted_n_right) ** 2.0 impurity_left[0] /= self.n_outputs impurity_right[0] /= self.n_outputs @@ -1118,5 +1119,5 @@ cdef class FriedmanMSE(MSE): diff = (self.weighted_n_right * total_sum_left - self.weighted_n_left * total_sum_right) / self.n_outputs - return (diff * diff / (self.weighted_n_left * self.weighted_n_right * + return (diff * diff / (self.weighted_n_left * self.weighted_n_right * self.weighted_n_node_samples)) diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 39c17fc1ca55e..0617508aab236 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -458,6 +458,7 @@ cdef class BestSplitter(BaseDenseSplitter): continue current_proxy_improvement = self.criterion.proxy_impurity_improvement() + if current_proxy_improvement > best_proxy_improvement: best_proxy_improvement = current_proxy_improvement current.threshold = (Xf[p - 1] + Xf[p]) / 2.0 diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index bf6d8bccf99d0..8d6da1f4e82c8 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -119,7 +119,8 @@ cdef void compute_weighted_median(double* median_dest, SIZE_t start, SIZE_t end, else: median_dest[k] = y_vals[n_node_samples / 2] free(y_vals) - free(weights) + if sample_weight != NULL: + free(weights) cdef void sort_values_and_weights(DOUBLE_t* y_vals, DOUBLE_t* weights, From d51e09184b661662a085aa8ea1c049826638b990 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Fri, 27 May 2016 18:49:42 -0700 Subject: [PATCH 26/75] style: remove extraneous newline / trigger CI build --- sklearn/tree/_utils.pyx | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index 8d6da1f4e82c8..c2823ddbeeac6 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -16,7 +16,6 @@ from libc.stdlib cimport calloc from libc.stdlib cimport realloc from libc.math cimport log as ln - import numpy as np cimport numpy as np np.import_array() From a9ccf188a52e861b1c530c5c78970b172cce5a20 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Fri, 27 May 2016 18:54:37 -0700 Subject: [PATCH 27/75] style: remove extraneous 0 from range --- sklearn/tree/_utils.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index c2823ddbeeac6..843b3e460859a 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -85,7 +85,7 @@ cdef void compute_weighted_median(double* median_dest, SIZE_t start, SIZE_t end, sum_weights = 0.0 for p in range(n_node_samples): - i = samples[p + start] + i = samples[start + p] y_ik = y[i * y_stride + k] if sample_weight != NULL: @@ -98,7 +98,7 @@ cdef void compute_weighted_median(double* median_dest, SIZE_t start, SIZE_t end, n_node_samples - 1) if sample_weight != NULL: # calculate the weighted median - for p in range(0, n_node_samples): + for p in range(n_node_samples): sum_weights += weights[p] running_sum = sum_weights - weights[0] From f46b3c2fcd067fac73a105b1724cbc56fb12e07a Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Sun, 29 May 2016 03:46:44 -0700 Subject: [PATCH 28/75] feature: save sorts within a node to speed it up --- sklearn/tree/_criterion.pxd | 2 ++ sklearn/tree/_criterion.pyx | 44 
+++++++++++++++++++--------- sklearn/tree/_utils.pxd | 6 ++-- sklearn/tree/_utils.pyx | 58 ++++++++++--------------------------- 4 files changed, 51 insertions(+), 59 deletions(-) diff --git a/sklearn/tree/_criterion.pxd b/sklearn/tree/_criterion.pxd index 889a623d732b3..a6a4f10885b43 100644 --- a/sklearn/tree/_criterion.pxd +++ b/sklearn/tree/_criterion.pxd @@ -47,6 +47,8 @@ cdef class Criterion: # where k is output index. cdef double* sum_left # Same as above, but for the left side of the split cdef double* sum_right # same as above, but for the right side of the split + cdef DOUBLE_t* coupled_sorted_y + cdef DOUBLE_t* coupled_sorted_weights # The criterion object is maintained such that left and right collected # statistics correspond to samples[start:pos] and samples[pos:end]. diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index f715d6f5e4e50..479c2bc8b9794 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -29,6 +29,7 @@ from ._utils cimport log from ._utils cimport safe_realloc from ._utils cimport sizet_ptr_to_ndarray from ._utils cimport compute_weighted_median +from ._utils cimport sort_values_and_weights cdef class Criterion: """Interface for impurity criteria. @@ -43,6 +44,8 @@ cdef class Criterion: free(self.sum_total) free(self.sum_left) free(self.sum_right) + free(self.coupled_sorted_y) + free(self.coupled_sorted_weights) def __getstate__(self): return {} @@ -718,6 +721,8 @@ cdef class RegressionCriterion(Criterion): self.sum_total = NULL self.sum_left = NULL self.sum_right = NULL + self.coupled_sorted_y = NULL + self.coupled_sorted_weights = NULL # Allocate memory for the accumulators self.sum_total = calloc(n_outputs, sizeof(double)) @@ -755,6 +760,16 @@ cdef class RegressionCriterion(Criterion): cdef DOUBLE_t w_y_ik cdef DOUBLE_t w = 1.0 + self.coupled_sorted_y = calloc(self.n_node_samples, + sizeof(DOUBLE_t)) + if sample_weight != NULL: + self.coupled_sorted_weights = calloc(self.n_node_samples, + sizeof(DOUBLE_t)) + if(self.coupled_sorted_y == NULL or + (self.coupled_sorted_weights == NULL and sample_weight != NULL)): + with gil: + raise MemoryError() + self.sq_sum_total = 0.0 memset(self.sum_total, 0, self.n_outputs * sizeof(double)) @@ -763,15 +778,20 @@ cdef class RegressionCriterion(Criterion): if sample_weight != NULL: w = sample_weight[i] + self.coupled_sorted_weights[p - start] = w for k in range(self.n_outputs): y_ik = y[i * y_stride + k] + self.coupled_sorted_y[p - start] = y_ik w_y_ik = w * y_ik self.sum_total[k] += w_y_ik self.sq_sum_total += w_y_ik * y_ik self.weighted_n_node_samples += w + sort_values_and_weights(self.coupled_sorted_y, self.coupled_sorted_weights, + 0, self.n_node_samples-1) + # Reset to pos=start self.reset() @@ -970,8 +990,10 @@ cdef class MAE(RegressionCriterion): """Computes the node value of samples[start:end] into dest.""" cdef SIZE_t start = self.start cdef SIZE_t end = self.end - compute_weighted_median(dest, start, end, self.sample_weight, self.y, - self.samples, self.y_stride, self.n_outputs) + compute_weighted_median(dest, 0, end-start, + self.coupled_sorted_weights, + self.coupled_sorted_y, + self.n_outputs) cdef double node_impurity(self) nogil: """Evaluate the impurity of the current node, i.e. 
the impurity of @@ -990,9 +1012,7 @@ cdef class MAE(RegressionCriterion): for p in range(self.start, self.end): i = samples[p] y_ik = self.y[i * self.y_stride + k] - # impurity += (fabs(y_ik - medians[k]) / self.weighted_n_node_samples) impurity += fabs(( y_ik) - medians[k]) - # return impurity / self.n_outputs free(medians) return impurity / (self.weighted_n_node_samples * self.n_outputs) @@ -1033,8 +1053,9 @@ cdef class MAE(RegressionCriterion): with gil: raise MemoryError() - compute_weighted_median(medians, start, pos, self.sample_weight, - self.y, self.samples, self.y_stride, + compute_weighted_median(medians, 0, pos-start, + self.coupled_sorted_weights, + self.coupled_sorted_y, self.n_outputs) impurity_left[0] = 0.0 impurity_right[0] = 0.0 @@ -1043,22 +1064,17 @@ cdef class MAE(RegressionCriterion): for p in range(start, pos): i = samples[p] y_ik = y[i * self.y_stride + k] - # impurity_left[0] += (fabs(y_ik - medians[k]) / (pos - start)) impurity_left[0] += fabs((y_ik) - medians[k]) - # impurity_left[0] /= self.n_outputs impurity_left[0] /= ((pos - start) * self.n_outputs) - - - compute_weighted_median(medians, pos, end, self.sample_weight, - self.y, self.samples, self.y_stride, + compute_weighted_median(medians, pos-start, end-start, + self.coupled_sorted_weights, + self.coupled_sorted_y, self.n_outputs) for k in range(self.n_outputs): for p in range(pos, end): i = samples[p] y_ik = y[i * self.y_stride + k] - # impurity_right[0] += (fabs(y_ik - medians[k]) / (end - pos)) impurity_right[0] += fabs((y_ik) - medians[k]) - # impurity_right[0] /= self.n_outputs impurity_right[0] /= ((end - pos) * self.n_outputs) free(medians) diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd index 0678675ab1175..dbea4a9c60233 100644 --- a/sklearn/tree/_utils.pxd +++ b/sklearn/tree/_utils.pxd @@ -40,10 +40,12 @@ cdef realloc_ptr safe_realloc(realloc_ptr* p, size_t nelems) except * cdef np.ndarray sizet_ptr_to_ndarray(SIZE_t* data, SIZE_t size) cdef void compute_weighted_median(double* median_dest, SIZE_t start, SIZE_t end, - DOUBLE_t* sample_weight, DOUBLE_t* y, - SIZE_t* samples, SIZE_t y_stride, + DOUBLE_t* coupled_sorted_weights, + DOUBLE_t* coupled_sorted_y, SIZE_t n_outputs) nogil +cdef void sort_values_and_weights(DOUBLE_t* y_vals, DOUBLE_t* weights, + SIZE_t low, SIZE_t high) nogil cdef SIZE_t rand_int(SIZE_t low, SIZE_t high, UINT32_t* random_state) nogil diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index 843b3e460859a..74e197e6f9293 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -51,75 +51,47 @@ def _realloc_test(): cdef void compute_weighted_median(double* median_dest, SIZE_t start, SIZE_t end, - DOUBLE_t* sample_weight, DOUBLE_t* y, - SIZE_t* samples, SIZE_t y_stride, + DOUBLE_t* coupled_sorted_weights, + DOUBLE_t* coupled_sorted_y, SIZE_t n_outputs) nogil: """Calculate the weighted median of samples[start:end] and put it into a destination pointer given values, weights, and a start and end index. 
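    Assumes the values and their weights have already been sorted together
    in ascending order of the values (see sort_values_and_weights).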
""" cdef DOUBLE_t w = 1.0 - cdef DOUBLE_t y_ik - cdef SIZE_t n_node_samples = end - start - - cdef SIZE_t i, p, k + cdef SIZE_t p, k cdef DOUBLE_t sum_weights cdef SIZE_t median_index cdef DOUBLE_t running_sum - - cdef DOUBLE_t* y_vals = NULL - cdef DOUBLE_t* weights = NULL - y_vals = calloc(n_node_samples, - sizeof(DOUBLE_t)) - - if sample_weight != NULL: - weights = calloc(n_node_samples, - sizeof(DOUBLE_t)) - - if (y_vals == NULL or (weights == NULL and sample_weight != NULL)): - with gil: - raise MemoryError() + cdef SIZE_t n_node_samples = end - start for k in range(n_outputs): median_index = 0 sum_weights = 0.0 - for p in range(n_node_samples): - i = samples[start + p] - - y_ik = y[i * y_stride + k] - if sample_weight != NULL: - w = sample_weight[i] - weights[p] = w - - y_vals[p] = y_ik - - sort_values_and_weights(y_vals, weights, 0, - n_node_samples - 1) - if sample_weight != NULL: + if coupled_sorted_weights != NULL: # calculate the weighted median for p in range(n_node_samples): - sum_weights += weights[p] + sum_weights += coupled_sorted_weights[p] - running_sum = sum_weights - weights[0] + running_sum = sum_weights - coupled_sorted_weights[0] while(running_sum > sum_weights/2): median_index += 1 - running_sum -= weights[median_index] + running_sum -= coupled_sorted_weights[median_index] if running_sum == sum_weights/2: - median_dest[k] = (y_vals[median_index] + y_vals[median_index + 1]) / 2 + median_dest[k] = (coupled_sorted_y[median_index] + + coupled_sorted_y[median_index + 1]) / 2 else: - median_dest[k] = y_vals[median_index] + median_dest[k] = coupled_sorted_y[median_index] else: # calculate the unweighted median - if n_node_samples % 2 == 0: - median_dest[k] = (y_vals[n_node_samples / 2] + y_vals[(n_node_samples / 2) - 1])/2 + if (n_node_samples) % 2 == 0: + median_dest[k] = (coupled_sorted_y[(n_node_samples / 2) + start] + + coupled_sorted_y[(n_node_samples / 2) - 1 + start])/2 else: - median_dest[k] = y_vals[n_node_samples / 2] - free(y_vals) - if sample_weight != NULL: - free(weights) + median_dest[k] = coupled_sorted_y[(n_node_samples / 2) + start] cdef void sort_values_and_weights(DOUBLE_t* y_vals, DOUBLE_t* weights, From 5635c979dbdc39d619c167ac45d0e2d6e1154f0e Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Sun, 29 May 2016 12:17:19 -0700 Subject: [PATCH 29/75] fix: move parts of dealloc to regression criterion --- sklearn/tree/_criterion.pxd | 6 ++++-- sklearn/tree/_criterion.pyx | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/sklearn/tree/_criterion.pxd b/sklearn/tree/_criterion.pxd index a6a4f10885b43..37e7f055b457a 100644 --- a/sklearn/tree/_criterion.pxd +++ b/sklearn/tree/_criterion.pxd @@ -47,8 +47,10 @@ cdef class Criterion: # where k is output index. cdef double* sum_left # Same as above, but for the left side of the split cdef double* sum_right # same as above, but for the right side of the split - cdef DOUBLE_t* coupled_sorted_y - cdef DOUBLE_t* coupled_sorted_weights + cdef DOUBLE_t* coupled_sorted_y # For MAE regression criteria, this stores the + # sorted y values + cdef DOUBLE_t* coupled_sorted_weights # For MAE regression criteria, this stores the + # weights corresponding to the sorted y values # The criterion object is maintained such that left and right collected # statistics correspond to samples[start:pos] and samples[pos:end]. 
diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 479c2bc8b9794..3130a7353772c 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -44,8 +44,6 @@ cdef class Criterion: free(self.sum_total) free(self.sum_left) free(self.sum_right) - free(self.coupled_sorted_y) - free(self.coupled_sorted_weights) def __getstate__(self): return {} @@ -686,6 +684,10 @@ cdef class RegressionCriterion(Criterion): var = \sum_i^n (y_i - y_bar) ** 2 = (\sum_i^n y_i ** 2) - n_samples * y_bar ** 2 """ + def __dealloc__(self): + """Destructor.""" + free(self.coupled_sorted_y) + free(self.coupled_sorted_weights) cdef double sq_sum_total From 97d44e335d0d459d98def2273328afd26fd0d751 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Sun, 29 May 2016 12:37:46 -0700 Subject: [PATCH 30/75] chore: add comment to splitter to try to force recythonizing --- sklearn/tree/_splitter.pyx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 0617508aab236..ff808c9c2639e 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -15,6 +15,8 @@ # # License: BSD 3 clause +# adding a new line to attempt to force recythonizing. + from ._criterion cimport Criterion from libc.stdlib cimport free From 58949f78d194dc07c9421e14ccbd1f4947ff5d4a Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Sun, 29 May 2016 12:46:36 -0700 Subject: [PATCH 31/75] chore: add comment to _tree.pyx to try to force recythonizing --- sklearn/tree/_tree.pyx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index f44320a7b47ae..69697f36c10d4 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -15,6 +15,8 @@ # # License: BSD 3 clause +# adding a comment to try to force recythonization + from cpython cimport Py_INCREF, PyObject from libc.stdlib cimport free From 0be994f89911148f4ade7bd4b98cb81b0080fa37 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Sun, 29 May 2016 13:00:26 -0700 Subject: [PATCH 32/75] chore: add empty comment to gradient boosting to force recythonizing --- sklearn/ensemble/_gradient_boosting.pyx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sklearn/ensemble/_gradient_boosting.pyx b/sklearn/ensemble/_gradient_boosting.pyx index 9e6e9f6d29c0e..fdd670cf31bc4 100644 --- a/sklearn/ensemble/_gradient_boosting.pyx +++ b/sklearn/ensemble/_gradient_boosting.pyx @@ -6,6 +6,8 @@ # # License: BSD 3 clause +# another empty comment to force recythonizing + cimport cython import numpy as np From 492ea7d590b2d3a3eb56229e38322d0c90823a38 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Sun, 29 May 2016 14:11:25 -0700 Subject: [PATCH 33/75] fix: fix bug in weighted median --- sklearn/tree/_criterion.pyx | 2 -- sklearn/tree/_utils.pyx | 9 ++++----- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 3130a7353772c..b7fe0f949c487 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -1027,7 +1027,6 @@ cdef class MAE(RegressionCriterion): # The absolute impurity improvement is only computed by the # impurity_improvement method once the best split has been found. 
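    # (While this override remains commented out, MAE inherits the
    # default Criterion.proxy_impurity_improvement, which computes the
    # proxy from children_impurity directly.)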
# """ - # # todo # pass @@ -1049,7 +1048,6 @@ cdef class MAE(RegressionCriterion): cdef DOUBLE_t y_ik cdef double* medians = NULL - medians = calloc(self.n_outputs, sizeof(double)) if (medians == NULL): with gil: diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index 74e197e6f9293..616ad1c1652b7 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -12,7 +12,6 @@ from libc.stdlib cimport free from libc.stdlib cimport malloc -from libc.stdlib cimport calloc from libc.stdlib cimport realloc from libc.math cimport log as ln @@ -81,15 +80,15 @@ cdef void compute_weighted_median(double* median_dest, SIZE_t start, SIZE_t end, running_sum -= coupled_sorted_weights[median_index] if running_sum == sum_weights/2: - median_dest[k] = (coupled_sorted_y[median_index] + - coupled_sorted_y[median_index + 1]) / 2 + median_dest[k] = (coupled_sorted_y[median_index + start] + + coupled_sorted_y[median_index + 1 + start]) / 2.0 else: - median_dest[k] = coupled_sorted_y[median_index] + median_dest[k] = coupled_sorted_y[median_index + start] else: # calculate the unweighted median if (n_node_samples) % 2 == 0: median_dest[k] = (coupled_sorted_y[(n_node_samples / 2) + start] + - coupled_sorted_y[(n_node_samples / 2) - 1 + start])/2 + coupled_sorted_y[(n_node_samples / 2) - 1 + start]) / 2.0 else: median_dest[k] = coupled_sorted_y[(n_node_samples / 2) + start] From 00fbe6efabae5c101dcbfcbc6fb87e187d77cecb Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Sun, 29 May 2016 14:48:27 -0700 Subject: [PATCH 34/75] try moving sorted values to a class variable --- sklearn/ensemble/_gradient_boosting.pyx | 2 -- sklearn/tree/_criterion.pxd | 4 ++-- sklearn/tree/_criterion.pyx | 2 ++ sklearn/tree/_splitter.pyx | 2 -- sklearn/tree/_tree.pyx | 2 -- 5 files changed, 4 insertions(+), 8 deletions(-) diff --git a/sklearn/ensemble/_gradient_boosting.pyx b/sklearn/ensemble/_gradient_boosting.pyx index fdd670cf31bc4..9e6e9f6d29c0e 100644 --- a/sklearn/ensemble/_gradient_boosting.pyx +++ b/sklearn/ensemble/_gradient_boosting.pyx @@ -6,8 +6,6 @@ # # License: BSD 3 clause -# another empty comment to force recythonizing - cimport cython import numpy as np diff --git a/sklearn/tree/_criterion.pxd b/sklearn/tree/_criterion.pxd index 37e7f055b457a..7320fb184d5e8 100644 --- a/sklearn/tree/_criterion.pxd +++ b/sklearn/tree/_criterion.pxd @@ -47,9 +47,9 @@ cdef class Criterion: # where k is output index. cdef double* sum_left # Same as above, but for the left side of the split cdef double* sum_right # same as above, but for the right side of the split - cdef DOUBLE_t* coupled_sorted_y # For MAE regression criteria, this stores the + # cdef DOUBLE_t* coupled_sorted_y # For MAE regression criteria, this stores the # sorted y values - cdef DOUBLE_t* coupled_sorted_weights # For MAE regression criteria, this stores the + # cdef DOUBLE_t* coupled_sorted_weights # For MAE regression criteria, this stores the # weights corresponding to the sorted y values # The criterion object is maintained such that left and right collected diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index b7fe0f949c487..44eeb3350c1b2 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -690,6 +690,8 @@ cdef class RegressionCriterion(Criterion): free(self.coupled_sorted_weights) cdef double sq_sum_total + cdef DOUBLE_t* coupled_sorted_y + cdef DOUBLE_t* coupled_sorted_weights def __cinit__(self, SIZE_t n_outputs): """Initialize parameters for this criterion. 
diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index ff808c9c2639e..0617508aab236 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -15,8 +15,6 @@ # # License: BSD 3 clause -# adding a new line to attempt to force recythonizing. - from ._criterion cimport Criterion from libc.stdlib cimport free diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 69697f36c10d4..f44320a7b47ae 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -15,8 +15,6 @@ # # License: BSD 3 clause -# adding a comment to try to force recythonization - from cpython cimport Py_INCREF, PyObject from libc.stdlib cimport free From f03cf38f92030a1908c0c99d602b703b30bd3478 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Sat, 18 Jun 2016 13:56:45 -0700 Subject: [PATCH 35/75] feature: refactor criterion to sort once initially, then draw all samples from this sorted data --- sklearn/tree/_criterion.pyx | 74 ++++++++++++++++++------------------- 1 file changed, 35 insertions(+), 39 deletions(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 44eeb3350c1b2..c95acf73cf60a 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -692,6 +692,7 @@ cdef class RegressionCriterion(Criterion): cdef double sq_sum_total cdef DOUBLE_t* coupled_sorted_y cdef DOUBLE_t* coupled_sorted_weights + cdef bint initialized def __cinit__(self, SIZE_t n_outputs): """Initialize parameters for this criterion. @@ -714,6 +715,7 @@ cdef class RegressionCriterion(Criterion): self.n_outputs = n_outputs self.n_node_samples = 0 + self.initialized = 0 self.weighted_n_node_samples = 0.0 self.weighted_n_left = 0.0 self.weighted_n_right = 0.0 @@ -764,37 +766,49 @@ cdef class RegressionCriterion(Criterion): cdef DOUBLE_t w_y_ik cdef DOUBLE_t w = 1.0 - self.coupled_sorted_y = calloc(self.n_node_samples, - sizeof(DOUBLE_t)) - if sample_weight != NULL: - self.coupled_sorted_weights = calloc(self.n_node_samples, - sizeof(DOUBLE_t)) - if(self.coupled_sorted_y == NULL or - (self.coupled_sorted_weights == NULL and sample_weight != NULL)): - with gil: - raise MemoryError() - self.sq_sum_total = 0.0 memset(self.sum_total, 0, self.n_outputs * sizeof(double)) + if self.initialized == 0: + self.coupled_sorted_y = calloc(self.n_node_samples, + sizeof(DOUBLE_t)) + if sample_weight != NULL: + self.coupled_sorted_weights = calloc(self.n_node_samples, + sizeof(DOUBLE_t)) + if(self.coupled_sorted_y == NULL or + (self.coupled_sorted_weights == NULL and sample_weight != NULL)): + with gil: + raise MemoryError() + for p in range(start, end): i = samples[p] if sample_weight != NULL: w = sample_weight[i] - self.coupled_sorted_weights[p - start] = w for k in range(self.n_outputs): y_ik = y[i * y_stride + k] - self.coupled_sorted_y[p - start] = y_ik w_y_ik = w * y_ik self.sum_total[k] += w_y_ik self.sq_sum_total += w_y_ik * y_ik self.weighted_n_node_samples += w - sort_values_and_weights(self.coupled_sorted_y, self.coupled_sorted_weights, - 0, self.n_node_samples-1) + if self.initialized == 0: + for p in range(start, end): + i = samples[p] + + if sample_weight != NULL: + w = sample_weight[i] + self.coupled_sorted_weights[p - start] = w + + for k in range(self.n_outputs): + y_ik = y[i * y_stride + k] + self.coupled_sorted_y[p - start] = y_ik + + sort_values_and_weights(self.coupled_sorted_y, self.coupled_sorted_weights, + 0, self.n_node_samples-1) + self.initialized = 1 # Reset to pos=start self.reset() @@ -821,7 +835,6 @@ cdef class RegressionCriterion(Criterion): cdef 
void update(self, SIZE_t new_pos) nogil: """Updated statistics by moving samples[pos:new_pos] to the left.""" - cdef double* sum_left = self.sum_left cdef double* sum_right = self.sum_right cdef double* sum_total = self.sum_total @@ -912,7 +925,6 @@ cdef class MSE(RegressionCriterion): impurity = self.sq_sum_total / self.weighted_n_node_samples for k in range(self.n_outputs): impurity -= (sum_total[k] / self.weighted_n_node_samples)**2.0 - return impurity / self.n_outputs cdef double proxy_impurity_improvement(self) nogil: @@ -937,7 +949,6 @@ cdef class MSE(RegressionCriterion): for k in range(self.n_outputs): proxy_impurity_left += sum_left[k] * sum_left[k] proxy_impurity_right += sum_right[k] * sum_right[k] - return (proxy_impurity_left / self.weighted_n_left + proxy_impurity_right / self.weighted_n_right) @@ -994,7 +1005,7 @@ cdef class MAE(RegressionCriterion): """Computes the node value of samples[start:end] into dest.""" cdef SIZE_t start = self.start cdef SIZE_t end = self.end - compute_weighted_median(dest, 0, end-start, + compute_weighted_median(dest, self.start, self.end, self.coupled_sorted_weights, self.coupled_sorted_y, self.n_outputs) @@ -1014,31 +1025,18 @@ cdef class MAE(RegressionCriterion): self.node_value(medians) for k in range(self.n_outputs): for p in range(self.start, self.end): - i = samples[p] - y_ik = self.y[i * self.y_stride + k] + y_ik = self.coupled_sorted_y[p] impurity += fabs(( y_ik) - medians[k]) free(medians) return impurity / (self.weighted_n_node_samples * self.n_outputs) - # cdef double proxy_impurity_improvement(self) nogil: - # """Compute a proxy of the impurity reduction - # This method is used to speed up the search for the best split. - # It is a proxy quantity such that the split that maximizes this value - # also maximizes the impurity improvement. It neglects all constant terms - # of the impurity decrease for a given split. - # The absolute impurity improvement is only computed by the - # impurity_improvement method once the best split has been found. - # """ - # # todo - # pass - cdef void children_impurity(self, double* impurity_left, double* impurity_right) nogil: """Evaluate the impurity in children nodes, i.e. the impurity of the left child (samples[start:pos]) and the impurity the right child (samples[pos:end]). 
""" - cdef DOUBLE_t* y = self.y + cdef DOUBLE_t* y = self.coupled_sorted_y cdef DOUBLE_t* sample_weight = self.sample_weight cdef SIZE_t* samples = self.samples @@ -1055,7 +1053,7 @@ cdef class MAE(RegressionCriterion): with gil: raise MemoryError() - compute_weighted_median(medians, 0, pos-start, + compute_weighted_median(medians, start, pos, self.coupled_sorted_weights, self.coupled_sorted_y, self.n_outputs) @@ -1064,18 +1062,16 @@ cdef class MAE(RegressionCriterion): for k in range(self.n_outputs): for p in range(start, pos): - i = samples[p] - y_ik = y[i * self.y_stride + k] + y_ik = y[p] impurity_left[0] += fabs((y_ik) - medians[k]) impurity_left[0] /= ((pos - start) * self.n_outputs) - compute_weighted_median(medians, pos-start, end-start, + compute_weighted_median(medians, pos, end, self.coupled_sorted_weights, self.coupled_sorted_y, self.n_outputs) for k in range(self.n_outputs): for p in range(pos, end): - i = samples[p] - y_ik = y[i * self.y_stride + k] + y_ik = y[p] impurity_right[0] += fabs((y_ik) - medians[k]) impurity_right[0] /= ((end - pos) * self.n_outputs) free(medians) From 2fdb56d5b799323393e5b023a0f534b2b1df0c53 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Sat, 18 Jun 2016 13:58:14 -0700 Subject: [PATCH 36/75] style: remove extraneous parens from if condition --- sklearn/tree/_criterion.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index c95acf73cf60a..1e0655f103172 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -1015,7 +1015,7 @@ cdef class MAE(RegressionCriterion): samples[start:end]""" cdef double* medians = NULL medians = calloc(self.n_outputs, sizeof(double)) - if (medians == NULL): + if medians == NULL: with gil: raise MemoryError() cdef double impurity = 0.0 From b9aef433c7682b5c65369cf2dd3fcdc3d9ce67c4 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Wed, 22 Jun 2016 07:48:20 -0700 Subject: [PATCH 37/75] implement median-heap method for calculating impurity --- sklearn/tree/_criterion.pxd | 4 - sklearn/tree/_criterion.pyx | 346 +++++++++++++++++++------ sklearn/tree/_utils.pxd | 52 +++- sklearn/tree/_utils.pyx | 487 +++++++++++++++++++++++++++--------- 4 files changed, 682 insertions(+), 207 deletions(-) diff --git a/sklearn/tree/_criterion.pxd b/sklearn/tree/_criterion.pxd index 7320fb184d5e8..889a623d732b3 100644 --- a/sklearn/tree/_criterion.pxd +++ b/sklearn/tree/_criterion.pxd @@ -47,10 +47,6 @@ cdef class Criterion: # where k is output index. cdef double* sum_left # Same as above, but for the left side of the split cdef double* sum_right # same as above, but for the right side of the split - # cdef DOUBLE_t* coupled_sorted_y # For MAE regression criteria, this stores the - # sorted y values - # cdef DOUBLE_t* coupled_sorted_weights # For MAE regression criteria, this stores the - # weights corresponding to the sorted y values # The criterion object is maintained such that left and right collected # statistics correspond to samples[start:pos] and samples[pos:end]. 
diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 1e0655f103172..929f6b7af4b0c 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -12,6 +12,7 @@ # Joel Nothman # Fares Hedayati # Jacob Schreiber +# Nelson Liu # # License: BSD 3 clause @@ -28,8 +29,8 @@ np.import_array() from ._utils cimport log from ._utils cimport safe_realloc from ._utils cimport sizet_ptr_to_ndarray -from ._utils cimport compute_weighted_median -from ._utils cimport sort_values_and_weights +from ._utils cimport MedianHeap +from ._utils cimport MinMaxHeapRecord cdef class Criterion: """Interface for impurity criteria. @@ -684,15 +685,7 @@ cdef class RegressionCriterion(Criterion): var = \sum_i^n (y_i - y_bar) ** 2 = (\sum_i^n y_i ** 2) - n_samples * y_bar ** 2 """ - def __dealloc__(self): - """Destructor.""" - free(self.coupled_sorted_y) - free(self.coupled_sorted_weights) - cdef double sq_sum_total - cdef DOUBLE_t* coupled_sorted_y - cdef DOUBLE_t* coupled_sorted_weights - cdef bint initialized def __cinit__(self, SIZE_t n_outputs): """Initialize parameters for this criterion. @@ -703,7 +696,6 @@ cdef class RegressionCriterion(Criterion): The number of targets to be predicted """ - # Default values self.y = NULL self.y_stride = 0 self.sample_weight = NULL @@ -715,7 +707,6 @@ cdef class RegressionCriterion(Criterion): self.n_outputs = n_outputs self.n_node_samples = 0 - self.initialized = 0 self.weighted_n_node_samples = 0.0 self.weighted_n_left = 0.0 self.weighted_n_right = 0.0 @@ -727,8 +718,6 @@ cdef class RegressionCriterion(Criterion): self.sum_total = NULL self.sum_left = NULL self.sum_right = NULL - self.coupled_sorted_y = NULL - self.coupled_sorted_weights = NULL # Allocate memory for the accumulators self.sum_total = calloc(n_outputs, sizeof(double)) @@ -769,17 +758,6 @@ cdef class RegressionCriterion(Criterion): self.sq_sum_total = 0.0 memset(self.sum_total, 0, self.n_outputs * sizeof(double)) - if self.initialized == 0: - self.coupled_sorted_y = calloc(self.n_node_samples, - sizeof(DOUBLE_t)) - if sample_weight != NULL: - self.coupled_sorted_weights = calloc(self.n_node_samples, - sizeof(DOUBLE_t)) - if(self.coupled_sorted_y == NULL or - (self.coupled_sorted_weights == NULL and sample_weight != NULL)): - with gil: - raise MemoryError() - for p in range(start, end): i = samples[p] @@ -793,29 +771,12 @@ cdef class RegressionCriterion(Criterion): self.sq_sum_total += w_y_ik * y_ik self.weighted_n_node_samples += w - - if self.initialized == 0: - for p in range(start, end): - i = samples[p] - - if sample_weight != NULL: - w = sample_weight[i] - self.coupled_sorted_weights[p - start] = w - - for k in range(self.n_outputs): - y_ik = y[i * y_stride + k] - self.coupled_sorted_y[p - start] = y_ik - - sort_values_and_weights(self.coupled_sorted_y, self.coupled_sorted_weights, - 0, self.n_node_samples-1) - self.initialized = 1 - - # Reset to pos=start self.reset() cdef void reset(self) nogil: """Reset the criterion at pos=start.""" cdef SIZE_t n_bytes = self.n_outputs * sizeof(double) + memset(self.sum_left, 0, n_bytes) memcpy(self.sum_right, self.sum_total, n_bytes) @@ -823,6 +784,7 @@ cdef class RegressionCriterion(Criterion): self.weighted_n_right = self.weighted_n_node_samples self.pos = self.start + cdef void reverse_reset(self) nogil: """Reset the criterion at pos=end.""" cdef SIZE_t n_bytes = self.n_outputs * sizeof(double) @@ -850,6 +812,7 @@ cdef class RegressionCriterion(Criterion): cdef SIZE_t k cdef DOUBLE_t w = 1.0 cdef DOUBLE_t y_ik + cdef DOUBLE_t 
w_y_ik # Update statistics up to new_pos # @@ -868,7 +831,8 @@ cdef class RegressionCriterion(Criterion): for k in range(self.n_outputs): y_ik = y[i * self.y_stride + k] - sum_left[k] += w * y_ik + w_y_ik = w * y_ik + sum_left[k] += w_y_ik self.weighted_n_left += w else: @@ -882,7 +846,8 @@ cdef class RegressionCriterion(Criterion): for k in range(self.n_outputs): y_ik = y[i * self.y_stride + k] - sum_left[k] -= w * y_ik + w_y_ik = w * y_ik + sum_left[k] -= w_y_ik self.weighted_n_left -= w @@ -1001,33 +966,248 @@ cdef class MSE(RegressionCriterion): cdef class MAE(RegressionCriterion): """Mean absolute error impurity criterion""" + def __dealloc__(self): + """Destructor.""" + + free(self.node_medians) + + cdef np.ndarray left_child_heaps + cdef np.ndarray right_child_heaps + cdef double* node_medians + + def __cinit__(self, SIZE_t n_outputs): + """Initialize parameters for this criterion. + + Parameters + ---------- + n_outputs: SIZE_t + The number of targets to be predicted + """ + + # Default values + self.y = NULL + self.y_stride = 0 + self.sample_weight = NULL + + self.samples = NULL + self.start = 0 + self.pos = 0 + self.end = 0 + + self.n_outputs = n_outputs + self.n_node_samples = 0 + self.weighted_n_node_samples = 0.0 + self.weighted_n_left = 0.0 + self.weighted_n_right = 0.0 + + # Allocate accumulators. Make sure they are NULL, not uninitialized, + # before an exception can be raised (which triggers __dealloc__). + self.node_medians = NULL + + # Allocate memory for the accumulators + self.node_medians = calloc(n_outputs, sizeof(double)) + + if (self.node_medians == NULL): + raise MemoryError() + + self.left_child_heaps = np.empty(n_outputs, dtype='object') + self.right_child_heaps = np.empty(n_outputs, dtype='object') + + cdef void init(self, DOUBLE_t* y, SIZE_t y_stride, DOUBLE_t* sample_weight, + double weighted_n_samples, SIZE_t* samples, SIZE_t start, + SIZE_t end) nogil: + """Initialize the criterion at node samples[start:end] and + children samples[start:start] and samples[start:end].""" + + # Initialize fields + self.y = y + self.y_stride = y_stride + self.sample_weight = sample_weight + self.samples = samples + self.start = start + self.end = end + self.n_node_samples = end - start + self.weighted_n_samples = weighted_n_samples + self.weighted_n_node_samples = 0. 
+
+        cdef SIZE_t i
+        cdef SIZE_t p
+        cdef SIZE_t k
+        cdef DOUBLE_t y_ik
+        cdef DOUBLE_t w_y_ik
+        cdef DOUBLE_t w = 1.0
+
+        # Fill accumulators with MedianHeaps
+        with gil:
+            for k in range(self.n_outputs):
+                self.left_child_heaps[k] = MedianHeap(self.n_node_samples)
+                self.right_child_heaps[k] = MedianHeap(self.n_node_samples)
+
+        cdef void** left_child_heaps = <void**> self.left_child_heaps.data
+        cdef void** right_child_heaps = <void**> self.right_child_heaps.data
+
+        for p in range(start, end):
+            i = samples[p]
+
+            if sample_weight != NULL:
+                w = sample_weight[i]
+
+            for k in range(self.n_outputs):
+                y_ik = y[i * y_stride + k]
+                w_y_ik = w * y_ik
+
+                # push all values to the right side,
+                # since pos = start initially anyway
+                (<MedianHeap> right_child_heaps[k]).push(w_y_ik)
+
+            self.weighted_n_node_samples += w
+        # calculate the node medians
+        for k in range(self.n_outputs):
+            (<MedianHeap> right_child_heaps[k]).get_median(&(self.node_medians[k]))
+
+        # Reset to pos=start
+        self.reset()
+
+    cdef void reset(self) nogil:
+        """Reset the criterion at pos=start."""
+
+        cdef SIZE_t i
+        cdef SIZE_t k
+        cdef DOUBLE_t popped
+
+        cdef void** left_child_heaps = <void**> self.left_child_heaps.data
+        cdef void** right_child_heaps = <void**> self.right_child_heaps.data
+
+        self.weighted_n_left = 0.0
+        self.weighted_n_right = self.weighted_n_node_samples
+        self.pos = self.start
+
+        # reset the medianheaps, left should have no elements and
+        # right should have all elements.
+
+        for k in range(self.n_outputs):
+            # if left has no elements, it's already reset
+            for i in range((<MedianHeap> left_child_heaps[k]).size()):
+                # remove everything from left and put it into right
+                (<MedianHeap> left_child_heaps[k]).pop(&popped)
+                (<MedianHeap> right_child_heaps[k]).push(popped)
+
+    cdef void reverse_reset(self) nogil:
+        """Reset the criterion at pos=end."""
+
+        self.weighted_n_right = 0.0
+        self.weighted_n_left = self.weighted_n_node_samples
+        self.pos = self.end
+
+        cdef DOUBLE_t popped
+        cdef void** left_child_heaps = <void**> self.left_child_heaps.data
+        cdef void** right_child_heaps = <void**> self.right_child_heaps.data
+
+        # reverse_reset the medianheaps, right should have no elements and
+        # left should have all elements.
+        for k in range(self.n_outputs):
+            # if right has no elements, it's already reset
+            for i in range((<MedianHeap> right_child_heaps[k]).size()):
+                # remove everything from right and put it into left
+                (<MedianHeap> right_child_heaps[k]).pop(&popped)
+                (<MedianHeap> left_child_heaps[k]).push(popped)
+
+    cdef void update(self, SIZE_t new_pos) nogil:
+        """Updated statistics by moving samples[pos:new_pos] to the left."""
+
+        cdef double* sample_weight = self.sample_weight
+        cdef SIZE_t* samples = self.samples
+
+        cdef void** left_child_heaps = <void**> self.left_child_heaps.data
+        cdef void** right_child_heaps = <void**> self.right_child_heaps.data
+
+        cdef DOUBLE_t* y = self.y
+        cdef SIZE_t pos = self.pos
+        cdef SIZE_t end = self.end
+        cdef SIZE_t i
+        cdef SIZE_t p
+        cdef SIZE_t k
+        cdef DOUBLE_t w = 1.0
+        cdef DOUBLE_t y_ik
+        cdef DOUBLE_t w_y_ik
+
+        # Update statistics up to new_pos
+        #
+        # We are going to update right_child_heaps and left_child_heaps
+        # from the direction that requires the least amount of
+        # computation, i.e. from pos to new_pos or from end to new_pos.
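The comment above is the heart of update(): the split point can be advanced from either boundary, and the criterion walks from whichever side touches fewer samples. A hedged one-function sketch of that rule (illustrative name, not part of the patch):

    def walk_forward(pos, new_pos, end):
        # True: move samples[pos:new_pos] to the left child, one
        # remove/push pair per sample; False: reverse_reset first, then
        # move samples[new_pos:end] to the right child instead.
        return (new_pos - pos) <= (end - new_pos)

Either direction leaves the heaps in the same state; the choice only bounds how many per-sample heap operations each update costs.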
+ + if (new_pos - pos) <= (end - new_pos): + for p in range(pos, new_pos): + i = samples[p] + + if sample_weight != NULL: + w = sample_weight[i] + + for k in range(self.n_outputs): + y_ik = y[i * self.y_stride + k] + w_y_ik = w * y_ik + + # remove w_y_ik from right and add to left + ( right_child_heaps[k]).remove(w_y_ik) + ( left_child_heaps[k]).push(w_y_ik) + + self.weighted_n_left += w + else: + self.reverse_reset() + + for p in range(end - 1, new_pos - 1, -1): + i = samples[p] + + if sample_weight != NULL: + w = sample_weight[i] + + for k in range(self.n_outputs): + y_ik = y[i * self.y_stride + k] + w_y_ik = w * y_ik + + # remove w_y_ik from left and add to right + ( left_child_heaps[k]).remove(w_y_ik) + ( right_child_heaps[k]).push(w_y_ik) + + self.weighted_n_left -= w + + self.weighted_n_right = (self.weighted_n_node_samples - + self.weighted_n_left) + self.pos = new_pos + cdef void node_value(self, double* dest) nogil: """Computes the node value of samples[start:end] into dest.""" - cdef SIZE_t start = self.start - cdef SIZE_t end = self.end - compute_weighted_median(dest, self.start, self.end, - self.coupled_sorted_weights, - self.coupled_sorted_y, - self.n_outputs) + + cdef SIZE_t k + for k in range(self.n_outputs): + dest[k] = self.node_medians[k] cdef double node_impurity(self) nogil: """Evaluate the impurity of the current node, i.e. the impurity of samples[start:end]""" - cdef double* medians = NULL - medians = calloc(self.n_outputs, sizeof(double)) - if medians == NULL: - with gil: - raise MemoryError() - cdef double impurity = 0.0 + + cdef DOUBLE_t* y = self.y + cdef DOUBLE_t* sample_weight = self.sample_weight cdef SIZE_t* samples = self.samples cdef SIZE_t i, p, k + cdef DOUBLE_t w = 1.0 cdef DOUBLE_t y_ik - self.node_value(medians) + cdef DOUBLE_t w_y_ik + + cdef double impurity = 0.0 + for k in range(self.n_outputs): for p in range(self.start, self.end): - y_ik = self.coupled_sorted_y[p] - impurity += fabs(( y_ik) - medians[k]) - free(medians) + i = samples[p] + + if sample_weight != NULL: + w = sample_weight[i] + + y_ik = y[i * self.y_stride + k] + w_y_ik = w * y_ik + + impurity += fabs(( w_y_ik) - self.node_medians[k]) return impurity / (self.weighted_n_node_samples * self.n_outputs) cdef void children_impurity(self, double* impurity_left, @@ -1036,7 +1216,7 @@ cdef class MAE(RegressionCriterion): left child (samples[start:pos]) and the impurity the right child (samples[pos:end]). 
""" - cdef DOUBLE_t* y = self.coupled_sorted_y + cdef DOUBLE_t* y = self.y cdef DOUBLE_t* sample_weight = self.sample_weight cdef SIZE_t* samples = self.samples @@ -1046,35 +1226,41 @@ cdef class MAE(RegressionCriterion): cdef SIZE_t i, p, k cdef DOUBLE_t y_ik + cdef DOUBLE_t w = 1.0 + cdef DOUBLE_t w_y_ik + cdef DOUBLE_t median - cdef double* medians = NULL - medians = calloc(self.n_outputs, sizeof(double)) - if (medians == NULL): - with gil: - raise MemoryError() + cdef void** left_child_heaps = self.left_child_heaps.data + cdef void** right_child_heaps = self.right_child_heaps.data - compute_weighted_median(medians, start, pos, - self.coupled_sorted_weights, - self.coupled_sorted_y, - self.n_outputs) impurity_left[0] = 0.0 impurity_right[0] = 0.0 for k in range(self.n_outputs): + ( left_child_heaps[k]).get_median(&median) for p in range(start, pos): - y_ik = y[p] - impurity_left[0] += fabs((y_ik) - medians[k]) + i = samples[p] + + if sample_weight != NULL: + w = sample_weight[i] + + y_ik = y[i * self.y_stride + k] + w_y_ik = w * y_ik + impurity_left[0] += fabs(( w_y_ik) - median) impurity_left[0] /= ((pos - start) * self.n_outputs) - compute_weighted_median(medians, pos, end, - self.coupled_sorted_weights, - self.coupled_sorted_y, - self.n_outputs) + for k in range(self.n_outputs): + ( right_child_heaps[k]).get_median(&median) for p in range(pos, end): - y_ik = y[p] - impurity_right[0] += fabs((y_ik) - medians[k]) + i = samples[p] + + if sample_weight != NULL: + w = sample_weight[i] + + y_ik = y[i * self.y_stride + k] + w_y_ik = w * y_ik + impurity_right[0] += fabs(( w_y_ik) - median) impurity_right[0] /= ((end - pos) * self.n_outputs) - free(medians) cdef class FriedmanMSE(MSE): diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd index dbea4a9c60233..90482c4e11c01 100644 --- a/sklearn/tree/_utils.pxd +++ b/sklearn/tree/_utils.pxd @@ -39,14 +39,6 @@ cdef realloc_ptr safe_realloc(realloc_ptr* p, size_t nelems) except * cdef np.ndarray sizet_ptr_to_ndarray(SIZE_t* data, SIZE_t size) -cdef void compute_weighted_median(double* median_dest, SIZE_t start, SIZE_t end, - DOUBLE_t* coupled_sorted_weights, - DOUBLE_t* coupled_sorted_y, - SIZE_t n_outputs) nogil - -cdef void sort_values_and_weights(DOUBLE_t* y_vals, DOUBLE_t* weights, - SIZE_t low, SIZE_t high) nogil - cdef SIZE_t rand_int(SIZE_t low, SIZE_t high, UINT32_t* random_state) nogil @@ -111,3 +103,47 @@ cdef class PriorityHeap: double impurity, double impurity_left, double impurity_right) nogil cdef int pop(self, PriorityHeapRecord* res) nogil + cdef void heapify_up(self, PriorityHeapRecord* heap, SIZE_t pos) nogil + cdef void heapify_down(self, PriorityHeapRecord* heap, SIZE_t pos, + SIZE_t heap_length) nogil + +# ============================================================================= +# MinMaxHeap data structure +# ============================================================================= + +# A record stored in the MinMaxHeap +cdef struct MinMaxHeapRecord: + DOUBLE_t data + +cdef class MinMaxHeap: + cdef SIZE_t capacity + cdef SIZE_t heap_ptr + cdef MinMaxHeapRecord* heap_ + cdef bint mode + + cdef void heapify_up(self, MinMaxHeapRecord* heap, SIZE_t pos) nogil + cdef void heapify_down(self, MinMaxHeapRecord* heap, SIZE_t pos, + SIZE_t heap_length) nogil + cdef bint is_empty(self) nogil + cdef SIZE_t size(self) nogil + cdef int push(self, DOUBLE_t data) nogil + cdef int remove(self, DOUBLE_t value) nogil + cdef int pop(self, DOUBLE_t* res) nogil + cdef int peek(self, DOUBLE_t* res) nogil + +# 
============================================================================= +# MedianHeap data structure +# ============================================================================= + +cdef class MedianHeap: + cdef SIZE_t initial_capacity + cdef SIZE_t current_capacity + cdef MinMaxHeap right_min_heap + cdef MinMaxHeap left_max_heap + + cdef SIZE_t size(self) nogil + cdef int pop(self, DOUBLE_t* res) nogil + cdef int push(self, DOUBLE_t data) nogil + cdef int remove(self, DOUBLE_t data) nogil + cdef int get_median(self, DOUBLE_t* data) nogil + cdef int rebalance(self) nogil diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index 616ad1c1652b7..b1a01a289d5d3 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -6,12 +6,14 @@ # Peter Prettenhofer # Arnaud Joly # Jacob Schreiber +# Nelson Liu # # # License: BSD 3 clause from libc.stdlib cimport free from libc.stdlib cimport malloc +from libc.stdlib cimport calloc from libc.stdlib cimport realloc from libc.math cimport log as ln @@ -49,85 +51,6 @@ def _realloc_test(): assert False -cdef void compute_weighted_median(double* median_dest, SIZE_t start, SIZE_t end, - DOUBLE_t* coupled_sorted_weights, - DOUBLE_t* coupled_sorted_y, - SIZE_t n_outputs) nogil: - """Calculate the weighted median of samples[start:end] and put - it into a destination pointer - given values, weights, and a start and end index. - """ - cdef DOUBLE_t w = 1.0 - cdef SIZE_t p, k - cdef DOUBLE_t sum_weights - cdef SIZE_t median_index - cdef DOUBLE_t running_sum - cdef SIZE_t n_node_samples = end - start - - for k in range(n_outputs): - median_index = 0 - sum_weights = 0.0 - - if coupled_sorted_weights != NULL: - # calculate the weighted median - for p in range(n_node_samples): - sum_weights += coupled_sorted_weights[p] - - running_sum = sum_weights - coupled_sorted_weights[0] - - while(running_sum > sum_weights/2): - median_index += 1 - running_sum -= coupled_sorted_weights[median_index] - - if running_sum == sum_weights/2: - median_dest[k] = (coupled_sorted_y[median_index + start] + - coupled_sorted_y[median_index + 1 + start]) / 2.0 - else: - median_dest[k] = coupled_sorted_y[median_index + start] - else: - # calculate the unweighted median - if (n_node_samples) % 2 == 0: - median_dest[k] = (coupled_sorted_y[(n_node_samples / 2) + start] + - coupled_sorted_y[(n_node_samples / 2) - 1 + start]) / 2.0 - else: - median_dest[k] = coupled_sorted_y[(n_node_samples / 2) + start] - - -cdef void sort_values_and_weights(DOUBLE_t* y_vals, DOUBLE_t* weights, - SIZE_t low, SIZE_t high) nogil: - """Sort an array and its weights""" - cdef SIZE_t pivot, i, j, - cdef DOUBLE_t temp - if low < high: - pivot = low - i = low - j = high - while i < j: - while(y_vals[i] <= y_vals[pivot] and i <= high): - i += 1 - while(y_vals[j] > y_vals[pivot] and j >= low): - j -= 1 - if i < j: - temp = y_vals[i] - y_vals[i] = y_vals[j] - y_vals[j] = temp - if weights != NULL: - temp = weights[i] - weights[i] = weights[j] - weights[j] = temp - temp = y_vals[j] - y_vals[j] = y_vals[pivot] - y_vals[pivot] = temp - - if weights != NULL: - temp = weights[j] - weights[j] = weights[pivot] - weights[pivot] = temp - - sort_values_and_weights(y_vals, weights, low, j-1) - sort_values_and_weights(y_vals, weights, j+1, high) - - # rand_r replacement using a 32bit XorShift generator # See http://www.jstatsoft.org/v08/i14/paper for details cdef inline UINT32_t our_rand_r(UINT32_t* seed) nogil: @@ -250,40 +173,6 @@ cdef class Stack: # PriorityHeap data structure # 
============================================================================= -cdef void heapify_up(PriorityHeapRecord* heap, SIZE_t pos) nogil: - """Restore heap invariant parent.improvement > child.improvement from - ``pos`` upwards. """ - if pos == 0: - return - - cdef SIZE_t parent_pos = (pos - 1) / 2 - - if heap[parent_pos].improvement < heap[pos].improvement: - heap[parent_pos], heap[pos] = heap[pos], heap[parent_pos] - heapify_up(heap, parent_pos) - - -cdef void heapify_down(PriorityHeapRecord* heap, SIZE_t pos, - SIZE_t heap_length) nogil: - """Restore heap invariant parent.improvement > children.improvement from - ``pos`` downwards. """ - cdef SIZE_t left_pos = 2 * (pos + 1) - 1 - cdef SIZE_t right_pos = 2 * (pos + 1) - cdef SIZE_t largest = pos - - if (left_pos < heap_length and - heap[left_pos].improvement > heap[largest].improvement): - largest = left_pos - - if (right_pos < heap_length and - heap[right_pos].improvement > heap[largest].improvement): - largest = right_pos - - if largest != pos: - heap[pos], heap[largest] = heap[largest], heap[pos] - heapify_down(heap, largest, heap_length) - - cdef class PriorityHeap: """A priority queue implemented as a binary heap. @@ -314,6 +203,38 @@ cdef class PriorityHeap: def __dealloc__(self): free(self.heap_) + cdef void heapify_up(self, PriorityHeapRecord* heap, SIZE_t pos) nogil: + """Restore heap invariant parent.improvement > child.improvement from + ``pos`` upwards. """ + if pos == 0: + return + + cdef SIZE_t parent_pos = (pos - 1) / 2 + + if heap[parent_pos].improvement < heap[pos].improvement: + heap[parent_pos], heap[pos] = heap[pos], heap[parent_pos] + self.heapify_up(heap, parent_pos) + + cdef void heapify_down(self, PriorityHeapRecord* heap, SIZE_t pos, + SIZE_t heap_length) nogil: + """Restore heap invariant parent.improvement > children.improvement from + ``pos`` downwards. """ + cdef SIZE_t left_pos = 2 * (pos + 1) - 1 + cdef SIZE_t right_pos = 2 * (pos + 1) + cdef SIZE_t largest = pos + + if (left_pos < heap_length and + heap[left_pos].improvement > heap[largest].improvement): + largest = left_pos + + if (right_pos < heap_length and + heap[right_pos].improvement > heap[largest].improvement): + largest = right_pos + + if largest != pos: + heap[pos], heap[largest] = heap[largest], heap[pos] + self.heapify_down(heap, largest, heap_length) + cdef bint is_empty(self) nogil: return self.heap_ptr <= 0 @@ -353,7 +274,7 @@ cdef class PriorityHeap: heap[heap_ptr].improvement = improvement # Heapify up - heapify_up(heap, heap_ptr) + self.heapify_up(heap, heap_ptr) # Increase element count self.heap_ptr = heap_ptr + 1 @@ -375,8 +296,344 @@ cdef class PriorityHeap: # Restore heap invariant if heap_ptr > 1: - heapify_down(heap, 0, heap_ptr - 1) + self.heapify_down(heap, 0, heap_ptr - 1) + + self.heap_ptr = heap_ptr - 1 + + return 0 + +# ============================================================================= +# MinMaxHeap data structure +# ============================================================================= + +cdef class MinMaxHeap: + """A priority queue implemented as a binary heap. + + The heap invariant is that the impurity improvement of the parent record + is larger then the impurity improvement of the children. + + Attributes + ---------- + capacity : SIZE_t + The capacity of the heap + + heap_ptr : SIZE_t + The water mark of the heap; the heap grows from left to right in the + array ``heap_``. heap_ptr is always less than capacity. + + heap_ : MinMaxHeapRecord* + The array of heap records. 
The maximum element is on the left; + the heap grows from left to right + + mode : bint + The mode of the heap. When the value of the ``mode`` parameter passed + in at construction is ``max``, the heap is a Max-Heap and mode is set + to 1. When the value of the ``mode`` parameter passed in at + construction is not ``max``, the heap is a Min-Heap and mode is set + to 0. + """ + + def __cinit__(self, SIZE_t capacity, str mode): + self.capacity = capacity + if mode == "max": + self.mode = 1 + else: + self.mode = 0 + + self.heap_ptr = 0 + + self.heap_ = calloc(capacity, sizeof(MinMaxHeapRecord)) + if self.heap_ == NULL: + raise MemoryError() + + def __dealloc__(self): + free(self.heap_) + + cdef void heapify_up(self, MinMaxHeapRecord* heap, SIZE_t pos) nogil: + """Restore heap invariant from + ``pos`` upwards. """ + if pos == 0: + return + + cdef SIZE_t parent_pos = (pos - 1) / 2 + + if self.mode == 1: + if heap[parent_pos].data < heap[pos].data: + heap[parent_pos], heap[pos] = heap[pos], heap[parent_pos] + self.heapify_up(heap, parent_pos) + else: + if heap[parent_pos].data > heap[pos].data: + heap[parent_pos], heap[pos] = heap[pos], heap[parent_pos] + self.heapify_up(heap, parent_pos) + + cdef void heapify_down(self, MinMaxHeapRecord* heap, SIZE_t pos, + SIZE_t heap_length) nogil: + """Restore heap invariant from + ``pos`` downwards. """ + cdef SIZE_t left_pos = 2 * (pos + 1) - 1 + cdef SIZE_t right_pos = 2 * (pos + 1) + cdef SIZE_t candidate = pos + + if self.mode == 1: + if (left_pos < heap_length and + heap[left_pos].data > heap[candidate].data): + candidate = left_pos + + if (right_pos < heap_length and + heap[right_pos].data > heap[candidate].data): + candidate = right_pos + else: + if (left_pos < heap_length and + heap[left_pos].data < heap[candidate].data): + candidate = left_pos + + if (right_pos < heap_length and + heap[right_pos].data < heap[candidate].data): + candidate = right_pos + if candidate != pos: + heap[pos], heap[candidate] = heap[candidate], heap[pos] + self.heapify_down(heap, candidate, heap_length) + + cdef bint is_empty(self) nogil: + return self.heap_ptr <= 0 + + cdef SIZE_t size(self) nogil: + return self.heap_ptr + + cdef int push(self, DOUBLE_t data) nogil: + """Push record on the priority heap. + + Returns 0 if successful; -1 on out of memory error. + """ + cdef SIZE_t heap_ptr = self.heap_ptr + cdef MinMaxHeapRecord* heap = NULL + + # Resize if capacity not sufficient + if heap_ptr >= self.capacity: + self.capacity *= 2 + heap = realloc(self.heap_, + self.capacity * + sizeof(MinMaxHeapRecord)) + if heap == NULL: + # no free; __dealloc__ handles that + return -1 + self.heap_ = heap + + # Put element as last element of heap + heap = self.heap_ + heap[heap_ptr].data = data + + # Heapify up + self.heapify_up(heap, heap_ptr) + + # Increase element count + self.heap_ptr = heap_ptr + 1 + return 0 + + cdef int remove(self, DOUBLE_t value) nogil: + """Remove a specific value from heap""" + cdef SIZE_t heap_ptr = self.heap_ptr + cdef MinMaxHeapRecord* heap = self.heap_ + cdef SIZE_t idx_to_remove = -1 + cdef SIZE_t i + + if heap_ptr <= 0: + return -1 + + # find element to remove + for i in range(0, heap_ptr): + if heap[i].data == value: + idx_to_remove = i + break + # should we throw an error if the element isn't found? + # it shouldn't happen, but better to fail noisily...? 
+ + # put the last element where we want to remove + heap[i], heap[heap_ptr - 1] = heap[heap_ptr - 1], heap[i] + + # Restore heap invariant + if heap_ptr > 1: + self.heapify_down(heap, 0, heap_ptr - 1) + + self.heap_ptr = heap_ptr - 1 + + return 0 + + + cdef int pop(self, DOUBLE_t* res) nogil: + """Remove top element from heap.""" + cdef SIZE_t heap_ptr = self.heap_ptr + cdef MinMaxHeapRecord* heap = self.heap_ + + if heap_ptr <= 0: + return -1 + + # Take first element + res[0] = heap[0].data + + # Put last element to the front + heap[0], heap[heap_ptr - 1] = heap[heap_ptr - 1], heap[0] + + # Restore heap invariant + if heap_ptr > 1: + self.heapify_down(heap, 0, heap_ptr - 1) self.heap_ptr = heap_ptr - 1 return 0 + + cdef int peek(self, DOUBLE_t* res) nogil: + """Write the top element from heap to a pointer.""" + cdef SIZE_t heap_ptr = self.heap_ptr + cdef MinMaxHeapRecord* heap = self.heap_ + if heap_ptr <= 0: + return -1 + # Take first value + res[0] = heap[0].data + return 0 + +# ============================================================================= +# MedianHeap data structure +# ============================================================================= + +cdef class MedianHeap: + + def __cinit__(self, SIZE_t initial_capacity): + self.initial_capacity = initial_capacity + self.current_capacity = 0 + self.left_max_heap = MinMaxHeap(initial_capacity, "max") + self.right_min_heap = MinMaxHeap(initial_capacity, "min") + + cdef SIZE_t size(self) nogil: + return self.current_capacity + + cdef int push(self, DOUBLE_t data) nogil: + """Push a value to the MedianHeap to be considered + in the median calculation + """ + cdef double current_median + cdef int return_value + + if self.current_capacity == 0: + return_value = self.left_max_heap.push(data) + else: + self.get_median(¤t_median) + if current_median <= data: + # data is greater than or equal to current median, so it goes on min heap + return_value = self.right_min_heap.push(data) + else: + # data is less than current median, so it goes on max heap + return_value = self.left_max_heap.push(data) + self.rebalance() + self.current_capacity += 1 + return return_value + + cdef int remove(self, DOUBLE_t data) nogil: + """Remove a value from the MedianHeap, removing it + from consideration in the median calculation + """ + cdef double current_median + cdef int return_value + + self.get_median(¤t_median) + if current_median == data: + # data is the same value as current median, it is in + # the bigger one + if self.right_min_heap.size() > self.left_max_heap.size(): + # it is in the right + return_value = self.right_min_heap.remove(data) + else: + # it is in the left + return_value = self.left_max_heap.remove(data) + elif current_median < data: + # data is greater than or equal to current median, so it is on min heap + return_value = self.right_min_heap.remove(data) + else: + # data is less than current median, so it is on max heap + return_value = self.left_max_heap.remove(data) + self.rebalance() + self.current_capacity -= 1 + return return_value + + cdef int pop(self, DOUBLE_t* res) nogil: + """Pop a value from the MedianHeap, starting from the + left and moving to the right. 
+ """ + cdef int return_value + + # no elements to pop + if self.current_capacity == 0: + return -1 + + if self.left_max_heap.size() != 0: + # pop from the left + return_value = self.left_max_heap.pop(res) + elif self.right_min_heap.size() != 0: + # pop from right + return_value = self.right_min_heap.pop(res) + else: + return -1 + self.rebalance() + self.current_capacity -= 1 + return return_value + + cdef int get_median(self, double* data) nogil: + """Return the current median""" + if self.current_capacity == 0: + return -1 + + cdef SIZE_t left_max_heap_size = self.left_max_heap.size() + cdef SIZE_t right_min_heap_size = self.right_min_heap.size() + cdef DOUBLE_t left_max_heap_median + cdef DOUBLE_t right_min_heap_median + + if self.current_capacity < 2: + # there is only one thing, so set the median to be that + if left_max_heap_size >= 1: + self.left_max_heap.peek(&left_max_heap_median) + data[0] = left_max_heap_median + else: + self.right_min_heap.peek(&right_min_heap_median) + data[0] = right_min_heap_median + return 0 + self.left_max_heap.peek(&left_max_heap_median) + self.right_min_heap.peek(&right_min_heap_median) + + if left_max_heap_size == right_min_heap_size: + # take the average of the two + data[0] = (left_max_heap_median + + right_min_heap_median) / 2.0 + elif left_max_heap_size > right_min_heap_size: + # left max heap larger, so median is at its' top + data[0] = left_max_heap_median + else: + # right min heap is larger, so median is at its' top + data[0] = right_min_heap_median + return 0 + + cdef int rebalance(self) nogil: + """Rebalance the left max heap and the left min heap to have a + one element or less difference in size""" + cdef SIZE_t left_max_heap_size = self.left_max_heap.size() + cdef SIZE_t right_min_heap_size = self.right_min_heap.size() + cdef SIZE_t size_difference = left_max_heap_size - right_min_heap_size + cdef DOUBLE_t popped + cdef SIZE_t i + + if size_difference >= -1 and size_difference <= 1: + # no balancing needed + return 0 + + if size_difference > 1: + # left max heap bigger + for i in range(0, size_difference - 1): + # pop from left max heap and push into right min heap + self.left_max_heap.pop(&popped) + self.right_min_heap.push(popped) + else: + # right min heap bigger + for i in range(0, (size_difference * -1) - 1): + # pop from right min heap and push into left max heap + self.right_min_heap.pop(&popped) + self.left_max_heap.push(popped) + return 0 From 39e693ca86ba4f1c222ad605a5238492a9743fa5 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Wed, 22 Jun 2016 08:35:38 -0700 Subject: [PATCH 38/75] style: remove extra line --- sklearn/tree/_criterion.pyx | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 929f6b7af4b0c..e2679fe004b19 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -797,6 +797,7 @@ cdef class RegressionCriterion(Criterion): cdef void update(self, SIZE_t new_pos) nogil: """Updated statistics by moving samples[pos:new_pos] to the left.""" + cdef double* sum_left = self.sum_left cdef double* sum_right = self.sum_right cdef double* sum_total = self.sum_total @@ -890,6 +891,7 @@ cdef class MSE(RegressionCriterion): impurity = self.sq_sum_total / self.weighted_n_node_samples for k in range(self.n_outputs): impurity -= (sum_total[k] / self.weighted_n_node_samples)**2.0 + return impurity / self.n_outputs cdef double proxy_impurity_improvement(self) nogil: @@ -914,6 +916,7 @@ cdef class MSE(RegressionCriterion): for k in 
range(self.n_outputs): proxy_impurity_left += sum_left[k] * sum_left[k] proxy_impurity_right += sum_right[k] * sum_right[k] + return (proxy_impurity_left / self.weighted_n_left + proxy_impurity_right / self.weighted_n_right) @@ -923,7 +926,6 @@ cdef class MSE(RegressionCriterion): left child (samples[start:pos]) and the impurity the right child (samples[pos:end]).""" - cdef DOUBLE_t* y = self.y cdef DOUBLE_t* sample_weight = self.sample_weight cdef SIZE_t* samples = self.samples From 20d6107c7ebfba1b96d483cfacb97a9ac304b3cb Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Thu, 23 Jun 2016 12:13:03 +0800 Subject: [PATCH 39/75] style: fix inadvertent cosmetic changes; i'll address some of these in a separate PR --- sklearn/tree/_criterion.pyx | 7 +++++-- sklearn/tree/_utils.pxd | 1 + 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index e2679fe004b19..3278eb0a2bec7 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -685,6 +685,7 @@ cdef class RegressionCriterion(Criterion): var = \sum_i^n (y_i - y_bar) ** 2 = (\sum_i^n y_i ** 2) - n_samples * y_bar ** 2 """ + cdef double sq_sum_total def __cinit__(self, SIZE_t n_outputs): @@ -696,6 +697,7 @@ cdef class RegressionCriterion(Criterion): The number of targets to be predicted """ + # Default values self.y = NULL self.y_stride = 0 self.sample_weight = NULL @@ -771,12 +773,13 @@ cdef class RegressionCriterion(Criterion): self.sq_sum_total += w_y_ik * y_ik self.weighted_n_node_samples += w + + # Reset to pos=start self.reset() cdef void reset(self) nogil: """Reset the criterion at pos=start.""" cdef SIZE_t n_bytes = self.n_outputs * sizeof(double) - memset(self.sum_left, 0, n_bytes) memcpy(self.sum_right, self.sum_total, n_bytes) @@ -784,7 +787,6 @@ cdef class RegressionCriterion(Criterion): self.weighted_n_right = self.weighted_n_node_samples self.pos = self.start - cdef void reverse_reset(self) nogil: """Reset the criterion at pos=end.""" cdef SIZE_t n_bytes = self.n_outputs * sizeof(double) @@ -926,6 +928,7 @@ cdef class MSE(RegressionCriterion): left child (samples[start:pos]) and the impurity the right child (samples[pos:end]).""" + cdef DOUBLE_t* y = self.y cdef DOUBLE_t* sample_weight = self.sample_weight cdef SIZE_t* samples = self.samples diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd index 90482c4e11c01..ada1620e9820b 100644 --- a/sklearn/tree/_utils.pxd +++ b/sklearn/tree/_utils.pxd @@ -39,6 +39,7 @@ cdef realloc_ptr safe_realloc(realloc_ptr* p, size_t nelems) except * cdef np.ndarray sizet_ptr_to_ndarray(SIZE_t* data, SIZE_t size) + cdef SIZE_t rand_int(SIZE_t low, SIZE_t high, UINT32_t* random_state) nogil From f73ac8e487301a6138d8013bf4ea185aa5cb92ae Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Fri, 8 Jul 2016 10:15:08 -0700 Subject: [PATCH 40/75] feature: change minmaxheap to internally use sorted arrays --- sklearn/tree/_utils.pxd | 3 -- sklearn/tree/_utils.pyx | 91 ++++++++++++----------------------------- 2 files changed, 26 insertions(+), 68 deletions(-) diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd index ada1620e9820b..f688e8340c605 100644 --- a/sklearn/tree/_utils.pxd +++ b/sklearn/tree/_utils.pxd @@ -122,9 +122,6 @@ cdef class MinMaxHeap: cdef MinMaxHeapRecord* heap_ cdef bint mode - cdef void heapify_up(self, MinMaxHeapRecord* heap, SIZE_t pos) nogil - cdef void heapify_down(self, MinMaxHeapRecord* heap, SIZE_t pos, - SIZE_t heap_length) nogil cdef bint is_empty(self) nogil cdef SIZE_t 
size(self) nogil cdef int push(self, DOUBLE_t data) nogil diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index b1a01a289d5d3..d54815279ce99 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -309,8 +309,10 @@ cdef class PriorityHeap: cdef class MinMaxHeap: """A priority queue implemented as a binary heap. - The heap invariant is that the impurity improvement of the parent record - is larger then the impurity improvement of the children. + The heap invariant is that the impurity improvement of the parent record is + larger then the impurity improvement of the children. The MinHeap is + essentially an array sorted in ascending order, and a MaxHeap is an array + sorted in descending order. Attributes ---------- @@ -349,51 +351,6 @@ cdef class MinMaxHeap: def __dealloc__(self): free(self.heap_) - cdef void heapify_up(self, MinMaxHeapRecord* heap, SIZE_t pos) nogil: - """Restore heap invariant from - ``pos`` upwards. """ - if pos == 0: - return - - cdef SIZE_t parent_pos = (pos - 1) / 2 - - if self.mode == 1: - if heap[parent_pos].data < heap[pos].data: - heap[parent_pos], heap[pos] = heap[pos], heap[parent_pos] - self.heapify_up(heap, parent_pos) - else: - if heap[parent_pos].data > heap[pos].data: - heap[parent_pos], heap[pos] = heap[pos], heap[parent_pos] - self.heapify_up(heap, parent_pos) - - cdef void heapify_down(self, MinMaxHeapRecord* heap, SIZE_t pos, - SIZE_t heap_length) nogil: - """Restore heap invariant from - ``pos`` downwards. """ - cdef SIZE_t left_pos = 2 * (pos + 1) - 1 - cdef SIZE_t right_pos = 2 * (pos + 1) - cdef SIZE_t candidate = pos - - if self.mode == 1: - if (left_pos < heap_length and - heap[left_pos].data > heap[candidate].data): - candidate = left_pos - - if (right_pos < heap_length and - heap[right_pos].data > heap[candidate].data): - candidate = right_pos - else: - if (left_pos < heap_length and - heap[left_pos].data < heap[candidate].data): - candidate = left_pos - - if (right_pos < heap_length and - heap[right_pos].data < heap[candidate].data): - candidate = right_pos - if candidate != pos: - heap[pos], heap[candidate] = heap[candidate], heap[pos] - self.heapify_down(heap, candidate, heap_length) - cdef bint is_empty(self) nogil: return self.heap_ptr <= 0 @@ -406,6 +363,7 @@ cdef class MinMaxHeap: Returns 0 if successful; -1 on out of memory error. """ cdef SIZE_t heap_ptr = self.heap_ptr + cdef SIZE_t i cdef MinMaxHeapRecord* heap = NULL # Resize if capacity not sufficient @@ -423,8 +381,19 @@ cdef class MinMaxHeap: heap = self.heap_ heap[heap_ptr].data = data - # Heapify up - self.heapify_up(heap, heap_ptr) + # bubble last element up according to mode + # max heap, sorted in descending order + i = heap_ptr + if self.mode == 1: + while(i != 0 and heap[i].data > heap[i-1].data): + heap[i], heap[i-1] = heap[i-1], heap[i] + i = i-1 + + # min heap, sorted in ascending order + else: + while(i != 0 and heap[i].data < heap[i-1].data): + heap[i], heap[i-1] = heap[i-1], heap[i] + i = i-1 # Increase element count self.heap_ptr = heap_ptr + 1 @@ -441,29 +410,25 @@ cdef class MinMaxHeap: return -1 # find element to remove - for i in range(0, heap_ptr): + for i in range(heap_ptr): if heap[i].data == value: idx_to_remove = i break # should we throw an error if the element isn't found? # it shouldn't happen, but better to fail noisily...? 
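At this point the heapify routines are gone entirely: MinMaxHeap now keeps its records in sorted order, so push bubbles the new record into place and remove and pop shift the tail over by one. A rough pure-Python analogue of that behaviour using the standard bisect module (the class and its names are illustrative, not the patch's API):

    import bisect

    class SortedArrayQueue:
        def __init__(self):
            self.data = []                   # ascending order, like the "min" mode

        def push(self, x):
            bisect.insort(self.data, x)      # O(n) shift, same cost as the bubble-up

        def remove(self, x):
            i = bisect.bisect_left(self.data, x)
            if i == len(self.data) or self.data[i] != x:
                raise ValueError("value not in queue")
            del self.data[i]                 # shifts the tail left by one

        def peek(self):
            return self.data[0]              # a "max" mode would keep the reverse order

Linear-time insertion looks like a step backwards from a true heap, but a sorted array gives constant-time access to any rank, which the weighted-median bookkeeping introduced in patch 41 (see get_index_data there) appears to rely on.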
- # put the last element where we want to remove - heap[i], heap[heap_ptr - 1] = heap[heap_ptr - 1], heap[i] - - # Restore heap invariant - if heap_ptr > 1: - self.heapify_down(heap, 0, heap_ptr - 1) + # move after the removed element over by one + for i in range(idx_to_remove, heap_ptr-1): + heap[i] = heap[i+1] self.heap_ptr = heap_ptr - 1 - return 0 - cdef int pop(self, DOUBLE_t* res) nogil: """Remove top element from heap.""" cdef SIZE_t heap_ptr = self.heap_ptr cdef MinMaxHeapRecord* heap = self.heap_ + cdef SIZE_t i if heap_ptr <= 0: return -1 @@ -471,15 +436,11 @@ cdef class MinMaxHeap: # Take first element res[0] = heap[0].data - # Put last element to the front - heap[0], heap[heap_ptr - 1] = heap[heap_ptr - 1], heap[0] - - # Restore heap invariant - if heap_ptr > 1: - self.heapify_down(heap, 0, heap_ptr - 1) + # move after the removed element over by one + for i in range(0, heap_ptr-1): + heap[i] = heap[i+1] self.heap_ptr = heap_ptr - 1 - return 0 cdef int peek(self, DOUBLE_t* res) nogil: From 5b8d665c2968c54cd1c07c237ddf0a1438b4cd4c Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Sat, 16 Jul 2016 13:02:39 -0500 Subject: [PATCH 41/75] refactored MAE and push to share work --- sklearn/tree/_criterion.pyx | 84 +++----- sklearn/tree/_utils.pxd | 45 ++-- sklearn/tree/_utils.pyx | 406 ++++++++++++++++++++---------------- 3 files changed, 281 insertions(+), 254 deletions(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 3278eb0a2bec7..15372f686852a 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -29,8 +29,7 @@ np.import_array() from ._utils cimport log from ._utils cimport safe_realloc from ._utils cimport sizet_ptr_to_ndarray -from ._utils cimport MedianHeap -from ._utils cimport MinMaxHeapRecord +from ._utils cimport WeightedMedianHeap cdef class Criterion: """Interface for impurity criteria. 
@@ -964,7 +963,7 @@ cdef class MSE(RegressionCriterion): for k in range(self.n_outputs): impurity_left[0] -= (sum_left[k] / self.weighted_n_left) ** 2.0 - impurity_right[0] -= (sum_right[k] / self.weighted_n_right) ** 2.0 + impurity_right[0] -= (sum_right[k] / self.weighted_n_right) ** 2.0 impurity_left[0] /= self.n_outputs impurity_right[0] /= self.n_outputs @@ -973,7 +972,6 @@ cdef class MAE(RegressionCriterion): """Mean absolute error impurity criterion""" def __dealloc__(self): """Destructor.""" - free(self.node_medians) cdef np.ndarray left_child_heaps @@ -1039,14 +1037,13 @@ cdef class MAE(RegressionCriterion): cdef SIZE_t p cdef SIZE_t k cdef DOUBLE_t y_ik - cdef DOUBLE_t w_y_ik cdef DOUBLE_t w = 1.0 # Fill accumulators with MedianHeaps with gil: for k in range(self.n_outputs): - self.left_child_heaps[k] = MedianHeap(self.n_node_samples) - self.right_child_heaps[k] = MedianHeap(self.n_node_samples) + self.left_child_heaps[k] = WeightedMedianHeap(self.n_node_samples) + self.right_child_heaps[k] = WeightedMedianHeap(self.n_node_samples) cdef void** left_child_heaps = self.left_child_heaps.data cdef void** right_child_heaps = self.right_child_heaps.data @@ -1059,16 +1056,16 @@ cdef class MAE(RegressionCriterion): for k in range(self.n_outputs): y_ik = y[i * y_stride + k] - w_y_ik = w * y_ik # push all values to the right side, # since pos = start initially anyway - ( right_child_heaps[k]).push(w_y_ik) + ( right_child_heaps[k]).push(y_ik, w) self.weighted_n_node_samples += w + # calculate the node medians for k in range(self.n_outputs): - ( right_child_heaps[k]).get_median(&(self.node_medians[k])) + ( right_child_heaps[k]).get_median(&(self.node_medians[k])) # Reset to pos=start self.reset() @@ -1078,7 +1075,8 @@ cdef class MAE(RegressionCriterion): cdef SIZE_t i cdef SIZE_t k - cdef DOUBLE_t popped + cdef DOUBLE_t popped_value + cdef DOUBLE_t popped_weight cdef void** left_child_heaps = self.left_child_heaps.data cdef void** right_child_heaps = self.right_child_heaps.data @@ -1092,10 +1090,12 @@ cdef class MAE(RegressionCriterion): for k in range(self.n_outputs): # if left has no elements, it's already reset - for i in range(( left_child_heaps[k]).size()): + for i in range(( left_child_heaps[k]).size()): # remove everything from left and put it into right - ( left_child_heaps[k]).pop(&popped) - ( right_child_heaps[k]).push(popped) + ( left_child_heaps[k]).pop(&popped_value, + &popped_weight) + ( right_child_heaps[k]).push(popped_value, + popped_weight) cdef void reverse_reset(self) nogil: """Reset the criterion at pos=end.""" @@ -1104,7 +1104,8 @@ cdef class MAE(RegressionCriterion): self.weighted_n_left = self.weighted_n_node_samples self.pos = self.end - cdef DOUBLE_t popped + cdef DOUBLE_t popped_value + cdef DOUBLE_t popped_weight cdef void** left_child_heaps = self.left_child_heaps.data cdef void** right_child_heaps = self.right_child_heaps.data @@ -1112,10 +1113,12 @@ cdef class MAE(RegressionCriterion): # left should have all elements. 
for k in range(self.n_outputs): # if right has no elements, it's already reset - for i in range(( right_child_heaps[k]).size()): + for i in range(( right_child_heaps[k]).size()): # remove everything from right and put it into left - ( right_child_heaps[k]).pop(&popped) - ( left_child_heaps[k]).push(popped) + ( right_child_heaps[k]).pop(&popped_value, + &popped_weight) + ( left_child_heaps[k]).push(popped_value, + popped_weight) cdef void update(self, SIZE_t new_pos) nogil: """Updated statistics by moving samples[pos:new_pos] to the left.""" @@ -1134,7 +1137,6 @@ cdef class MAE(RegressionCriterion): cdef SIZE_t k cdef DOUBLE_t w = 1.0 cdef DOUBLE_t y_ik - cdef DOUBLE_t w_y_ik # Update statistics up to new_pos # @@ -1151,11 +1153,9 @@ cdef class MAE(RegressionCriterion): for k in range(self.n_outputs): y_ik = y[i * self.y_stride + k] - w_y_ik = w * y_ik - - # remove w_y_ik from right and add to left - ( right_child_heaps[k]).remove(w_y_ik) - ( left_child_heaps[k]).push(w_y_ik) + # remove y_ik with weight w from right and add to left + ( right_child_heaps[k]).remove(y_ik, w) + ( left_child_heaps[k]).push(y_ik, w) self.weighted_n_left += w else: @@ -1169,15 +1169,13 @@ cdef class MAE(RegressionCriterion): for k in range(self.n_outputs): y_ik = y[i * self.y_stride + k] - w_y_ik = w * y_ik - - # remove w_y_ik from left and add to right - ( left_child_heaps[k]).remove(w_y_ik) - ( right_child_heaps[k]).push(w_y_ik) + # remove y_ik from left and add to right + ( left_child_heaps[k]).remove(y_ik, w) + ( right_child_heaps[k]).push(y_ik, w) self.weighted_n_left -= w - self.weighted_n_right = (self.weighted_n_node_samples - + self.weighted_n_right = (self.weighted_n_node_samples - self.weighted_n_left) self.pos = new_pos @@ -1196,7 +1194,6 @@ cdef class MAE(RegressionCriterion): cdef DOUBLE_t* sample_weight = self.sample_weight cdef SIZE_t* samples = self.samples cdef SIZE_t i, p, k - cdef DOUBLE_t w = 1.0 cdef DOUBLE_t y_ik cdef DOUBLE_t w_y_ik @@ -1206,13 +1203,9 @@ cdef class MAE(RegressionCriterion): for p in range(self.start, self.end): i = samples[p] - if sample_weight != NULL: - w = sample_weight[i] - y_ik = y[i * self.y_stride + k] - w_y_ik = w * y_ik - impurity += fabs(( w_y_ik) - self.node_medians[k]) + impurity += fabs(( y_ik) - self.node_medians[k]) return impurity / (self.weighted_n_node_samples * self.n_outputs) cdef void children_impurity(self, double* impurity_left, @@ -1231,8 +1224,6 @@ cdef class MAE(RegressionCriterion): cdef SIZE_t i, p, k cdef DOUBLE_t y_ik - cdef DOUBLE_t w = 1.0 - cdef DOUBLE_t w_y_ik cdef DOUBLE_t median cdef void** left_child_heaps = self.left_child_heaps.data @@ -1242,29 +1233,22 @@ cdef class MAE(RegressionCriterion): impurity_right[0] = 0.0 for k in range(self.n_outputs): - ( left_child_heaps[k]).get_median(&median) + ( left_child_heaps[k]).get_median(&median) for p in range(start, pos): i = samples[p] - if sample_weight != NULL: - w = sample_weight[i] - y_ik = y[i * self.y_stride + k] - w_y_ik = w * y_ik - impurity_left[0] += fabs(( w_y_ik) - median) + + impurity_left[0] += fabs(( y_ik) - median) impurity_left[0] /= ((pos - start) * self.n_outputs) for k in range(self.n_outputs): - ( right_child_heaps[k]).get_median(&median) + ( right_child_heaps[k]).get_median(&median) for p in range(pos, end): i = samples[p] - if sample_weight != NULL: - w = sample_weight[i] - y_ik = y[i * self.y_stride + k] - w_y_ik = w * y_ik - impurity_right[0] += fabs(( w_y_ik) - median) + impurity_right[0] += fabs(( y_ik) - median) impurity_right[0] /= ((end - pos) * self.n_outputs) 
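With this patch the queues store value/weight pairs and the split statistic becomes a true weighted median. For reference, a from-scratch definition of the quantity WeightedMedianHeap maintains incrementally; the function below is an illustrative sketch, not the patch's API, and assumes non-empty inputs with positive weights:

    def weighted_median(values, weights):
        # smallest value whose cumulative weight reaches half the total weight
        pairs = sorted(zip(values, weights))
        total = float(sum(weights))
        cumulative = 0.0
        for value, weight in pairs:
            cumulative += weight
            if cumulative >= total / 2.0:
                return value

Some definitions average this value with the next one when the cumulative weight lands exactly on total / 2; tracking that boundary case incrementally is the job of the update_median_parameters_post_push and update_median_parameters_post_remove methods declared in the _utils.pxd diff that follows.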
diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd index f688e8340c605..9d5b8a725b371 100644 --- a/sklearn/tree/_utils.pxd +++ b/sklearn/tree/_utils.pxd @@ -109,39 +109,46 @@ cdef class PriorityHeap: SIZE_t heap_length) nogil # ============================================================================= -# MinMaxHeap data structure +# WeightedPQueue data structure # ============================================================================= -# A record stored in the MinMaxHeap -cdef struct MinMaxHeapRecord: +# A record stored in the WeightedPQueue +cdef struct WeightedPQueueRecord: DOUBLE_t data + DOUBLE_t weight -cdef class MinMaxHeap: +cdef class WeightedPQueue: cdef SIZE_t capacity - cdef SIZE_t heap_ptr - cdef MinMaxHeapRecord* heap_ - cdef bint mode + cdef SIZE_t array_ptr + cdef WeightedPQueueRecord* array_ cdef bint is_empty(self) nogil cdef SIZE_t size(self) nogil - cdef int push(self, DOUBLE_t data) nogil - cdef int remove(self, DOUBLE_t value) nogil - cdef int pop(self, DOUBLE_t* res) nogil - cdef int peek(self, DOUBLE_t* res) nogil + cdef int push(self, DOUBLE_t data, DOUBLE_t weight) nogil + cdef int remove(self, DOUBLE_t value, DOUBLE_t weight) nogil + cdef int pop(self, DOUBLE_t* data, DOUBLE_t* weight) nogil + cdef int peek(self, DOUBLE_t* res, DOUBLE_t* weight) nogil + cdef int get_index_data(self, SIZE_t idx, DOUBLE_t* value, DOUBLE_t* weight) nogil # ============================================================================= # MedianHeap data structure # ============================================================================= -cdef class MedianHeap: +cdef class WeightedMedianHeap: cdef SIZE_t initial_capacity cdef SIZE_t current_capacity - cdef MinMaxHeap right_min_heap - cdef MinMaxHeap left_max_heap + cdef WeightedPQueue samples + cdef DOUBLE_t total_weight + cdef SIZE_t k + cdef DOUBLE_t sum_w_0_k # represents sum(weights[0:k]) + # = w[0] + w[1] + ... + w[k-1] cdef SIZE_t size(self) nogil - cdef int pop(self, DOUBLE_t* res) nogil - cdef int push(self, DOUBLE_t data) nogil - cdef int remove(self, DOUBLE_t data) nogil - cdef int get_median(self, DOUBLE_t* data) nogil - cdef int rebalance(self) nogil + cdef int push(self, DOUBLE_t data, DOUBLE_t weight) nogil + cdef int update_median_parameters_post_push(self, DOUBLE_t data, DOUBLE_t weight) nogil + cdef DOUBLE_t get_weight_from_index(self, SIZE_t index) nogil + cdef DOUBLE_t get_value_from_index(self, SIZE_t index) nogil + cdef int remove(self, DOUBLE_t data, DOUBLE_t weight) nogil + cdef int pop(self, DOUBLE_t* data, DOUBLE_t* weight) nogil + cdef int update_median_parameters_post_remove(self, DOUBLE_t data, DOUBLE_t weight) nogil + cdef int get_median(self, double* median) nogil diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index d54815279ce99..17e9361fc8f6c 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -303,220 +303,263 @@ cdef class PriorityHeap: return 0 # ============================================================================= -# MinMaxHeap data structure +# WeightedPQueue data structure # ============================================================================= -cdef class MinMaxHeap: - """A priority queue implemented as a binary heap. - - The heap invariant is that the impurity improvement of the parent record is - larger then the impurity improvement of the children. The MinHeap is - essentially an array sorted in ascending order, and a MaxHeap is an array - sorted in descending order. 
+cdef class WeightedPQueue: + """A priority queue class, always sorted in increasing order. Attributes ---------- capacity : SIZE_t - The capacity of the heap + The capacity of the array - heap_ptr : SIZE_t - The water mark of the heap; the heap grows from left to right in the - array ``heap_``. heap_ptr is always less than capacity. + array_ptr : SIZE_t + The water mark of the array; the array grows from left to right in the + array ``array_``. array_ptr is always less than capacity. - heap_ : MinMaxHeapRecord* - The array of heap records. The maximum element is on the left; - the heap grows from left to right - - mode : bint - The mode of the heap. When the value of the ``mode`` parameter passed - in at construction is ``max``, the heap is a Max-Heap and mode is set - to 1. When the value of the ``mode`` parameter passed in at - construction is not ``max``, the heap is a Min-Heap and mode is set - to 0. + array_ : WeightedPQueueRecord* + The array of array records. The minimum element is on the left; + the array grows from left to right """ - def __cinit__(self, SIZE_t capacity, str mode): + def __cinit__(self, SIZE_t capacity): self.capacity = capacity - if mode == "max": - self.mode = 1 - else: - self.mode = 0 + self.array_ptr = 0 - self.heap_ptr = 0 + self.array_ = calloc(capacity, sizeof(WeightedPQueueRecord)) - self.heap_ = calloc(capacity, sizeof(MinMaxHeapRecord)) - if self.heap_ == NULL: + if self.array_ == NULL: raise MemoryError() def __dealloc__(self): - free(self.heap_) + free(self.array_) cdef bint is_empty(self) nogil: - return self.heap_ptr <= 0 + return self.array_ptr <= 0 cdef SIZE_t size(self) nogil: - return self.heap_ptr - - cdef int push(self, DOUBLE_t data) nogil: - """Push record on the priority heap. + return self.array_ptr + cdef int push(self, DOUBLE_t data, DOUBLE_t weight) nogil: + """Push record on the array. Returns 0 if successful; -1 on out of memory error. 
""" - cdef SIZE_t heap_ptr = self.heap_ptr + cdef SIZE_t array_ptr = self.array_ptr cdef SIZE_t i - cdef MinMaxHeapRecord* heap = NULL + cdef WeightedPQueueRecord* array # Resize if capacity not sufficient - if heap_ptr >= self.capacity: + if array_ptr >= self.capacity: self.capacity *= 2 - heap = realloc(self.heap_, + array = realloc(self.array_, self.capacity * - sizeof(MinMaxHeapRecord)) - if heap == NULL: + sizeof(WeightedPQueueRecord)) + + if array == NULL: # no free; __dealloc__ handles that return -1 - self.heap_ = heap + self.array_ = array - # Put element as last element of heap - heap = self.heap_ - heap[heap_ptr].data = data - - # bubble last element up according to mode - # max heap, sorted in descending order - i = heap_ptr - if self.mode == 1: - while(i != 0 and heap[i].data > heap[i-1].data): - heap[i], heap[i-1] = heap[i-1], heap[i] - i = i-1 - - # min heap, sorted in ascending order - else: - while(i != 0 and heap[i].data < heap[i-1].data): - heap[i], heap[i-1] = heap[i-1], heap[i] - i = i-1 + # Put element as last element of array + array = self.array_ + array[array_ptr].data = data + array[array_ptr].weight = weight + + # bubble last element up according until it is sorted + # in ascending order + i = array_ptr + while(i != 0 and array[i].data < array[i-1].data): + array[i], array[i-1] = array[i-1], array[i] + i -= 1 # Increase element count - self.heap_ptr = heap_ptr + 1 + self.array_ptr = array_ptr + 1 return 0 - cdef int remove(self, DOUBLE_t value) nogil: - """Remove a specific value from heap""" - cdef SIZE_t heap_ptr = self.heap_ptr - cdef MinMaxHeapRecord* heap = self.heap_ + cdef int remove(self, DOUBLE_t value, DOUBLE_t weight) nogil: + """Remove a specific value from array""" + cdef SIZE_t array_ptr = self.array_ptr + cdef WeightedPQueueRecord* array = self.array_ cdef SIZE_t idx_to_remove = -1 cdef SIZE_t i - if heap_ptr <= 0: + if array_ptr <= 0: return -1 # find element to remove - for i in range(heap_ptr): - if heap[i].data == value: + for i in range(array_ptr): + if array[i].data == value and array[i].weight == weight: idx_to_remove = i break + # should we throw an error if the element isn't found? # it shouldn't happen, but better to fail noisily...? 
+ if idx_to_remove == -1: + with gil: + raise ValueError() # move after the removed element over by one - for i in range(idx_to_remove, heap_ptr-1): - heap[i] = heap[i+1] + for i in range(idx_to_remove, array_ptr-1): + array[i] = array[i+1] - self.heap_ptr = heap_ptr - 1 + self.array_ptr = array_ptr - 1 return 0 - cdef int pop(self, DOUBLE_t* res) nogil: - """Remove top element from heap.""" - cdef SIZE_t heap_ptr = self.heap_ptr - cdef MinMaxHeapRecord* heap = self.heap_ + cdef int pop(self, DOUBLE_t* data, DOUBLE_t* weight) nogil: + """Remove top element from array.""" + cdef SIZE_t array_ptr = self.array_ptr + cdef WeightedPQueueRecord* array = self.array_ cdef SIZE_t i - if heap_ptr <= 0: + if array_ptr <= 0: return -1 # Take first element - res[0] = heap[0].data + data[0] = array[0].data + weight[0] = array[0].weight # move after the removed element over by one - for i in range(0, heap_ptr-1): - heap[i] = heap[i+1] + for i in range(0, array_ptr-1): + array[i] = array[i+1] - self.heap_ptr = heap_ptr - 1 + self.array_ptr = array_ptr - 1 return 0 - cdef int peek(self, DOUBLE_t* res) nogil: - """Write the top element from heap to a pointer.""" - cdef SIZE_t heap_ptr = self.heap_ptr - cdef MinMaxHeapRecord* heap = self.heap_ - if heap_ptr <= 0: + cdef int peek(self, DOUBLE_t* data, DOUBLE_t* weight) nogil: + """Write the top element from array to a pointer.""" + cdef SIZE_t array_ptr = self.array_ptr + cdef WeightedPQueueRecord* array = self.array_ + if array_ptr <= 0: return -1 # Take first value - res[0] = heap[0].data + data[0] = array[0].data + weight[0] = array[0].weight + return 0 + + cdef int get_index_data(self, SIZE_t idx, DOUBLE_t* value, + DOUBLE_t* weight) nogil: + """Write value and weight at the specified index to a pointer.""" + cdef SIZE_t array_ptr = self.array_ptr + cdef WeightedPQueueRecord* array = self.array_ + + if array_ptr <= 0: + return -1 + # Take value at idx + value[0] = array[idx].data + + # Take weight at idx + weight[0] = array[idx].weight return 0 # ============================================================================= -# MedianHeap data structure +# WeightedMedianHeap data structure # ============================================================================= -cdef class MedianHeap: +cdef class WeightedMedianHeap: def __cinit__(self, SIZE_t initial_capacity): self.initial_capacity = initial_capacity self.current_capacity = 0 - self.left_max_heap = MinMaxHeap(initial_capacity, "max") - self.right_min_heap = MinMaxHeap(initial_capacity, "min") + self.samples = WeightedPQueue(initial_capacity) + self.total_weight = 0 + self.k = 0 + self.sum_w_0_k = 0 cdef SIZE_t size(self) nogil: return self.current_capacity - cdef int push(self, DOUBLE_t data) nogil: - """Push a value to the MedianHeap to be considered + cdef int push(self, DOUBLE_t data, DOUBLE_t weight) nogil: + """Push a value and its associated weight + to the WeightedMedianHeap to be considered in the median calculation """ cdef double current_median cdef int return_value - if self.current_capacity == 0: - return_value = self.left_max_heap.push(data) - else: - self.get_median(¤t_median) - if current_median <= data: - # data is greater than or equal to current median, so it goes on min heap - return_value = self.right_min_heap.push(data) - else: - # data is less than current median, so it goes on max heap - return_value = self.left_max_heap.push(data) - self.rebalance() + return_value = self.samples.push(data, weight) self.current_capacity += 1 + self.total_weight += weight + 
self.update_median_parameters_post_push(data, weight) return return_value - cdef int remove(self, DOUBLE_t data) nogil: + cdef int update_median_parameters_post_push(self, DOUBLE_t data, + DOUBLE_t weight) nogil: + """Update the parameters used in the median calculation, + namely `k` and `sum_w_0_k` after an insertion""" + cdef double current_median + + # trivial case of one element. + if self.current_capacity == 1: + self.k = 1 + self.sum_w_0_k = self.total_weight + return 0 + + # get the current weighted median + self.get_median(¤t_median) + + # check if the value inserted is the same as the current median + if data == current_median: + # k stays the same, but add weight to sum_w_0_k + self.sum_w_0_k += weight + return 0 + + if data < current_median: + # inserting below the median, so increment k and + # then update self.sum_w_0_k accordingly by adding + # the weight that was added. + self.k += 1 + # update sum_w_0_k by adding the weight added + self.sum_w_0_k += weight + + # minimize k such that sum(W[0:k]) >= total_weight / 2 + # minimum value of k is 1 + while(self.k != 1 and (self.sum_w_0_k - self.get_weight_from_index(self.k-1) >= self.total_weight / 2)): + # ordering of these statements is very important + self.k -= 1 + self.sum_w_0_k -= self.get_weight_from_index(self.k) + return 0 + + if data > current_median: + # inserting above the median + # minimize k such that sum(W[0:k]) >= total_weight / 2 + while(self.k != self.current_capacity and self.sum_w_0_k < self.total_weight / 2): + self.k += 1 + self.sum_w_0_k += self.get_weight_from_index(self.k-1) + return 0 + + cdef DOUBLE_t get_weight_from_index(self, SIZE_t index) nogil: + """Given an index between [0,self.current_capacity], access + the appropriate heap and return the requested weight""" + cdef DOUBLE_t value + cdef DOUBLE_t weight + + self.samples.get_index_data(index, &value, &weight) + return weight + + cdef DOUBLE_t get_value_from_index(self, SIZE_t index) nogil: + """Given an index between [0,self.current_capacity], access + the appropriate heap and return the requested value""" + cdef DOUBLE_t value + cdef DOUBLE_t weight + + self.samples.get_index_data(index, &value, &weight) + return value + + cdef int remove(self, DOUBLE_t data, DOUBLE_t weight) nogil: """Remove a value from the MedianHeap, removing it from consideration in the median calculation """ - cdef double current_median + cdef double current_unweighted_median cdef int return_value - self.get_median(¤t_median) - if current_median == data: - # data is the same value as current median, it is in - # the bigger one - if self.right_min_heap.size() > self.left_max_heap.size(): - # it is in the right - return_value = self.right_min_heap.remove(data) - else: - # it is in the left - return_value = self.left_max_heap.remove(data) - elif current_median < data: - # data is greater than or equal to current median, so it is on min heap - return_value = self.right_min_heap.remove(data) - else: - # data is less than current median, so it is on max heap - return_value = self.left_max_heap.remove(data) - self.rebalance() + return_value = self.samples.remove(data, weight) self.current_capacity -= 1 + self.total_weight -= weight + self.update_median_parameters_post_remove(data, weight) return return_value - cdef int pop(self, DOUBLE_t* res) nogil: + cdef int pop(self, DOUBLE_t* data, DOUBLE_t* weight) nogil: """Pop a value from the MedianHeap, starting from the left and moving to the right. 
""" @@ -526,75 +569,68 @@ cdef class MedianHeap: if self.current_capacity == 0: return -1 - if self.left_max_heap.size() != 0: - # pop from the left - return_value = self.left_max_heap.pop(res) - elif self.right_min_heap.size() != 0: - # pop from right - return_value = self.right_min_heap.pop(res) - else: - return -1 - self.rebalance() + return_value = self.samples.pop(data, weight) self.current_capacity -= 1 + self.total_weight -= weight[0] + self.update_median_parameters_post_remove(data[0], + weight[0]) return return_value - cdef int get_median(self, double* data) nogil: - """Return the current median""" - if self.current_capacity == 0: - return -1 + cdef int update_median_parameters_post_remove(self, DOUBLE_t data, + DOUBLE_t weight) nogil: + """Update the parameters used in the median calculation, + namely `k` and `sum_w_0_k` after a removal""" + cdef DOUBLE_t current_median + # trivial case of one element. + if self.current_capacity == 1: + self.k = 1 + self.sum_w_0_k = self.total_weight + return 0 + + # get the current weighted median + self.get_median(¤t_median) - cdef SIZE_t left_max_heap_size = self.left_max_heap.size() - cdef SIZE_t right_min_heap_size = self.right_min_heap.size() - cdef DOUBLE_t left_max_heap_median - cdef DOUBLE_t right_min_heap_median - - if self.current_capacity < 2: - # there is only one thing, so set the median to be that - if left_max_heap_size >= 1: - self.left_max_heap.peek(&left_max_heap_median) - data[0] = left_max_heap_median - else: - self.right_min_heap.peek(&right_min_heap_median) - data[0] = right_min_heap_median + # check if the value removed is the same as the current median + if data == current_median: + # k stays the same, but remove weight from sum_w_0_k + self.sum_w_0_k -= weight return 0 - self.left_max_heap.peek(&left_max_heap_median) - self.right_min_heap.peek(&right_min_heap_median) - - if left_max_heap_size == right_min_heap_size: - # take the average of the two - data[0] = (left_max_heap_median + - right_min_heap_median) / 2.0 - elif left_max_heap_size > right_min_heap_size: - # left max heap larger, so median is at its' top - data[0] = left_max_heap_median - else: - # right min heap is larger, so median is at its' top - data[0] = right_min_heap_median - return 0 - cdef int rebalance(self) nogil: - """Rebalance the left max heap and the left min heap to have a - one element or less difference in size""" - cdef SIZE_t left_max_heap_size = self.left_max_heap.size() - cdef SIZE_t right_min_heap_size = self.right_min_heap.size() - cdef SIZE_t size_difference = left_max_heap_size - right_min_heap_size - cdef DOUBLE_t popped - cdef SIZE_t i + if data < current_median: + # removing below the median, so decrement k and + # then update self.sum_w_0_k accordingly by subtracting + # the removed weight + self.k -= 1 + # update sum_w_0_k by removing the weight at index k + self.sum_w_0_k -= weight + + # minimize k such that sum(W[0:k]) >= total_weight / 2 + # by incrementing k and updating sum_w_0_k accordingly + # until the condition is met. 
+ while(self.k != self.current_capacity and self.sum_w_0_k < self.total_weight / 2): + # ordering of these statements is very important + self.k += 1 + self.sum_w_0_k += self.get_weight_from_index(self.k-1) + return 0 - if size_difference >= -1 and size_difference <= 1: - # no balancing needed + if data > current_median: + # removing above the median + # minimize k such that sum(W[0:k]) >= total_weight / 2 + while(self.k != 1 and self.sum_w_0_k - self.get_weight_from_index(self.k-1) >= self.total_weight / 2): + # mind the ordering + self.k -= 1 + self.sum_w_0_k -= self.get_weight_from_index(self.k) return 0 - if size_difference > 1: - # left max heap bigger - for i in range(0, size_difference - 1): - # pop from left max heap and push into right min heap - self.left_max_heap.pop(&popped) - self.right_min_heap.push(popped) - else: - # right min heap bigger - for i in range(0, (size_difference * -1) - 1): - # pop from right min heap and push into left max heap - self.right_min_heap.pop(&popped) - self.left_max_heap.push(popped) + cdef int get_median(self, double* median) nogil: + """Write the median to a pointer, taking into account + sample weights.""" + if self.sum_w_0_k < (self.total_weight / 2.0): + return -1 + if self.sum_w_0_k == (self.total_weight / 2.0): + # split median + median[0] = (self.get_value_from_index(self.k) + self.get_value_from_index(self.k-1))/2 + if self.sum_w_0_k > (self.total_weight / 2.0): + # whole median + median[0] = self.get_value_from_index(self.k-1) return 0 From 9920cfcd91c44a771d1a7d50f001a73382428338 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Sun, 17 Jul 2016 12:00:08 -0500 Subject: [PATCH 42/75] fix errors wrt median insertion case --- sklearn/tree/_criterion.pyx | 14 +++- sklearn/tree/_utils.pxd | 7 +- sklearn/tree/_utils.pyx | 139 +++++++++++++++++++----------------- 3 files changed, 91 insertions(+), 69 deletions(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 15372f686852a..2367bcb5e1c58 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -1038,7 +1038,6 @@ cdef class MAE(RegressionCriterion): cdef SIZE_t k cdef DOUBLE_t y_ik cdef DOUBLE_t w = 1.0 - # Fill accumulators with MedianHeaps with gil: for k in range(self.n_outputs): @@ -1221,6 +1220,10 @@ cdef class MAE(RegressionCriterion): cdef SIZE_t start = self.start cdef SIZE_t pos = self.pos cdef SIZE_t end = self.end + # with gil: + # print "start {}".format(start) + # print "pos {}".format(pos) + # print "end {}".format(end) cdef SIZE_t i, p, k cdef DOUBLE_t y_ik @@ -1234,6 +1237,8 @@ cdef class MAE(RegressionCriterion): for k in range(self.n_outputs): ( left_child_heaps[k]).get_median(&median) + # with gil: + # print "median {}".format(median) for p in range(start, pos): i = samples[p] @@ -1241,15 +1246,22 @@ cdef class MAE(RegressionCriterion): impurity_left[0] += fabs(( y_ik) - median) impurity_left[0] /= ((pos - start) * self.n_outputs) + # with gil: + # print "impurity_left[0] {}".format(impurity_left[0]) for k in range(self.n_outputs): ( right_child_heaps[k]).get_median(&median) + # with gil: + # print "median {}".format(median) for p in range(pos, end): i = samples[p] y_ik = y[i * self.y_stride + k] + impurity_right[0] += fabs(( y_ik) - median) impurity_right[0] /= ((end - pos) * self.n_outputs) + # with gil: + # print "impurity_right[0] {}".format(impurity_right[0]) cdef class FriedmanMSE(MSE): diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd index 9d5b8a725b371..2f31f915a3191 100644 --- a/sklearn/tree/_utils.pxd +++ 
b/sklearn/tree/_utils.pxd @@ -128,7 +128,9 @@ cdef class WeightedPQueue: cdef int remove(self, DOUBLE_t value, DOUBLE_t weight) nogil cdef int pop(self, DOUBLE_t* data, DOUBLE_t* weight) nogil cdef int peek(self, DOUBLE_t* res, DOUBLE_t* weight) nogil - cdef int get_index_data(self, SIZE_t idx, DOUBLE_t* value, DOUBLE_t* weight) nogil + cdef DOUBLE_t get_weight_from_index(self, SIZE_t index) nogil + cdef DOUBLE_t get_value_from_index(self, SIZE_t index) nogil + # ============================================================================= # MedianHeap data structure @@ -136,7 +138,6 @@ cdef class WeightedPQueue: cdef class WeightedMedianHeap: cdef SIZE_t initial_capacity - cdef SIZE_t current_capacity cdef WeightedPQueue samples cdef DOUBLE_t total_weight cdef SIZE_t k @@ -146,8 +147,6 @@ cdef class WeightedMedianHeap: cdef SIZE_t size(self) nogil cdef int push(self, DOUBLE_t data, DOUBLE_t weight) nogil cdef int update_median_parameters_post_push(self, DOUBLE_t data, DOUBLE_t weight) nogil - cdef DOUBLE_t get_weight_from_index(self, SIZE_t index) nogil - cdef DOUBLE_t get_value_from_index(self, SIZE_t index) nogil cdef int remove(self, DOUBLE_t data, DOUBLE_t weight) nogil cdef int pop(self, DOUBLE_t* data, DOUBLE_t* weight) nogil cdef int update_median_parameters_post_remove(self, DOUBLE_t data, DOUBLE_t weight) nogil diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index 17e9361fc8f6c..7cd536080bc77 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -326,7 +326,6 @@ cdef class WeightedPQueue: def __cinit__(self, SIZE_t capacity): self.capacity = capacity self.array_ptr = 0 - self.array_ = calloc(capacity, sizeof(WeightedPQueueRecord)) if self.array_ == NULL: @@ -346,15 +345,15 @@ cdef class WeightedPQueue: Returns 0 if successful; -1 on out of memory error. 
""" cdef SIZE_t array_ptr = self.array_ptr + cdef WeightedPQueueRecord* array = NULL cdef SIZE_t i - cdef WeightedPQueueRecord* array # Resize if capacity not sufficient if array_ptr >= self.capacity: self.capacity *= 2 array = realloc(self.array_, - self.capacity * - sizeof(WeightedPQueueRecord)) + self.capacity * + sizeof(WeightedPQueueRecord)) if array == NULL: # no free; __dealloc__ handles that @@ -437,20 +436,35 @@ cdef class WeightedPQueue: weight[0] = array[0].weight return 0 - cdef int get_index_data(self, SIZE_t idx, DOUBLE_t* value, - DOUBLE_t* weight) nogil: - """Write value and weight at the specified index to a pointer.""" + cdef DOUBLE_t get_weight_from_index(self, SIZE_t index) nogil: + """Given an index between [0,self.current_capacity], access + the appropriate heap and return the requested weight""" cdef SIZE_t array_ptr = self.array_ptr cdef WeightedPQueueRecord* array = self.array_ - if array_ptr <= 0: + if array_ptr <= 0 or index >= array_ptr: + with gil: + print index + print array_ptr + print "FALIED ON WEIGHT" return -1 - # Take value at idx - value[0] = array[idx].data - # Take weight at idx - weight[0] = array[idx].weight - return 0 + return array[index].weight + + cdef DOUBLE_t get_value_from_index(self, SIZE_t index) nogil: + """Given an index between [0,self.current_capacity], access + the appropriate heap and return the requested value""" + cdef SIZE_t array_ptr = self.array_ptr + cdef WeightedPQueueRecord* array = self.array_ + + if array_ptr <= 0 or index >= array_ptr: + with gil: + print index + print array_ptr + print "FALIED ON VALUE" + return -1 + # Take value at idx + return array[index].data # ============================================================================= # WeightedMedianHeap data structure @@ -460,14 +474,13 @@ cdef class WeightedMedianHeap: def __cinit__(self, SIZE_t initial_capacity): self.initial_capacity = initial_capacity - self.current_capacity = 0 self.samples = WeightedPQueue(initial_capacity) self.total_weight = 0 self.k = 0 self.sum_w_0_k = 0 cdef SIZE_t size(self) nogil: - return self.current_capacity + return self.samples.size() cdef int push(self, DOUBLE_t data, DOUBLE_t weight) nogil: """Push a value and its associated weight @@ -478,8 +491,6 @@ cdef class WeightedMedianHeap: cdef int return_value return_value = self.samples.push(data, weight) - self.current_capacity += 1 - self.total_weight += weight self.update_median_parameters_post_push(data, weight) return return_value @@ -490,19 +501,21 @@ cdef class WeightedMedianHeap: cdef double current_median # trivial case of one element. 
- if self.current_capacity == 1: + if self.size() == 1: self.k = 1 + self.total_weight = weight self.sum_w_0_k = self.total_weight return 0 - # get the current weighted median + # get the original weighted median self.get_median(¤t_median) - + self.total_weight += weight # check if the value inserted is the same as the current median - if data == current_median: - # k stays the same, but add weight to sum_w_0_k - self.sum_w_0_k += weight - return 0 + # if data == current_median: + # # k stays the same, but add weight to sum_w_0_k + # self.k += 1 + # self.sum_w_0_k += weight + # return 0 if data < current_median: # inserting below the median, so increment k and @@ -514,38 +527,21 @@ cdef class WeightedMedianHeap: # minimize k such that sum(W[0:k]) >= total_weight / 2 # minimum value of k is 1 - while(self.k != 1 and (self.sum_w_0_k - self.get_weight_from_index(self.k-1) >= self.total_weight / 2)): + while(self.k > 1 and (self.sum_w_0_k - self.samples.get_weight_from_index(self.k-1) >= self.total_weight / 2.0)): # ordering of these statements is very important self.k -= 1 - self.sum_w_0_k -= self.get_weight_from_index(self.k) + self.sum_w_0_k -= self.samples.get_weight_from_index(self.k) + return 0 - if data > current_median: + if data >= current_median: # inserting above the median # minimize k such that sum(W[0:k]) >= total_weight / 2 - while(self.k != self.current_capacity and self.sum_w_0_k < self.total_weight / 2): + while(self.k < self.samples.size() and self.sum_w_0_k < self.total_weight / 2.0): self.k += 1 - self.sum_w_0_k += self.get_weight_from_index(self.k-1) + self.sum_w_0_k += self.samples.get_weight_from_index(self.k-1) return 0 - cdef DOUBLE_t get_weight_from_index(self, SIZE_t index) nogil: - """Given an index between [0,self.current_capacity], access - the appropriate heap and return the requested weight""" - cdef DOUBLE_t value - cdef DOUBLE_t weight - - self.samples.get_index_data(index, &value, &weight) - return weight - - cdef DOUBLE_t get_value_from_index(self, SIZE_t index) nogil: - """Given an index between [0,self.current_capacity], access - the appropriate heap and return the requested value""" - cdef DOUBLE_t value - cdef DOUBLE_t weight - - self.samples.get_index_data(index, &value, &weight) - return value - cdef int remove(self, DOUBLE_t data, DOUBLE_t weight) nogil: """Remove a value from the MedianHeap, removing it from consideration in the median calculation @@ -554,8 +550,6 @@ cdef class WeightedMedianHeap: cdef int return_value return_value = self.samples.remove(data, weight) - self.current_capacity -= 1 - self.total_weight -= weight self.update_median_parameters_post_remove(data, weight) return return_value @@ -566,12 +560,10 @@ cdef class WeightedMedianHeap: cdef int return_value # no elements to pop - if self.current_capacity == 0: + if self.samples.size() == 0: return -1 return_value = self.samples.pop(data, weight) - self.current_capacity -= 1 - self.total_weight -= weight[0] self.update_median_parameters_post_remove(data[0], weight[0]) return return_value @@ -581,25 +573,38 @@ cdef class WeightedMedianHeap: """Update the parameters used in the median calculation, namely `k` and `sum_w_0_k` after a removal""" cdef DOUBLE_t current_median + # reset parameters because empty + if self.samples.size() == 0: + self.k = 0 + self.total_weight = 0 + self.sum_w_0_k = 0 + return 0 + # trivial case of one element. 
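# ---------------------------------------------------------------------------
# Editorial example, not part of the patch (unit weights, hypothetical
# numbers): the removal update below mirrors the insertion case. From
# [1, 2, 5, 8] with k == 2 and sum_w_0_k == 2, removing the value 1 (below
# the median) first decrements k to 1 and sum_w_0_k to 1; with total_weight
# now 3, the re-minimization loop advances k back to 2, since
# sum_w_0_k == 1 < total_weight / 2 == 1.5, giving the whole median 5.
# ---------------------------------------------------------------------------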
- if self.current_capacity == 1: + if self.samples.size() == 1: self.k = 1 + self.total_weight -= weight self.sum_w_0_k = self.total_weight return 0 # get the current weighted median self.get_median(¤t_median) + self.total_weight -= weight # check if the value removed is the same as the current median - if data == current_median: - # k stays the same, but remove weight from sum_w_0_k - self.sum_w_0_k -= weight - return 0 + # if data == current_median: + # # with gil: + # # print "removing at median" + # # k stays the same, but remove weight from sum_w_0_k + # self.sum_w_0_k -= weight + # return 0 if data < current_median: # removing below the median, so decrement k and # then update self.sum_w_0_k accordingly by subtracting # the removed weight + # with gil: + # print "removing below median" self.k -= 1 # update sum_w_0_k by removing the weight at index k self.sum_w_0_k -= weight @@ -607,30 +612,36 @@ cdef class WeightedMedianHeap: # minimize k such that sum(W[0:k]) >= total_weight / 2 # by incrementing k and updating sum_w_0_k accordingly # until the condition is met. - while(self.k != self.current_capacity and self.sum_w_0_k < self.total_weight / 2): + while(self.k < self.samples.size() and self.sum_w_0_k < self.total_weight / 2.0): # ordering of these statements is very important self.k += 1 - self.sum_w_0_k += self.get_weight_from_index(self.k-1) + self.sum_w_0_k += self.samples.get_weight_from_index(self.k-1) return 0 - if data > current_median: + if data >= current_median: # removing above the median + # with gil: + # print "removing above median" # minimize k such that sum(W[0:k]) >= total_weight / 2 - while(self.k != 1 and self.sum_w_0_k - self.get_weight_from_index(self.k-1) >= self.total_weight / 2): + while(self.k > 1 and self.sum_w_0_k - self.samples.get_weight_from_index(self.k-1) >= self.total_weight / 2.0): # mind the ordering self.k -= 1 - self.sum_w_0_k -= self.get_weight_from_index(self.k) + self.sum_w_0_k -= self.samples.get_weight_from_index(self.k) return 0 cdef int get_median(self, double* median) nogil: """Write the median to a pointer, taking into account sample weights.""" + # with gil: + # print "entered get_median" if self.sum_w_0_k < (self.total_weight / 2.0): + # with gil: + # raise ValueError() return -1 if self.sum_w_0_k == (self.total_weight / 2.0): # split median - median[0] = (self.get_value_from_index(self.k) + self.get_value_from_index(self.k-1))/2 + median[0] = ( (self.samples.get_value_from_index(self.k) + self.samples.get_value_from_index(self.k-1)) / 2.0) if self.sum_w_0_k > (self.total_weight / 2.0): # whole median - median[0] = self.get_value_from_index(self.k-1) + median[0] = self.samples.get_value_from_index(self.k-1) return 0 From 53207d48245c3424042c15e52fed7d9338f34221 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Sun, 17 Jul 2016 12:11:47 -0500 Subject: [PATCH 43/75] spurious comment to force recythonization --- sklearn/tree/_splitter.pyx | 2 +- sklearn/tree/_tree.pyx | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 0617508aab236..b449f5aecbd8b 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -35,7 +35,7 @@ from ._utils cimport RAND_R_MAX from ._utils cimport safe_realloc cdef double INFINITY = np.inf - +# SPURIOUS COMMENT TO FORCE RECYTHONIZE. REMOVE ME. 
# Mitigate precision differences between 32 bit and 64 bit cdef DTYPE_t FEATURE_THRESHOLD = 1e-7 diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index f44320a7b47ae..38eb4815ff5e1 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -36,7 +36,7 @@ from ._utils cimport PriorityHeap from ._utils cimport PriorityHeapRecord from ._utils cimport safe_realloc from ._utils cimport sizet_ptr_to_ndarray - +# SPURIOUS COMMENT TO FORCE RECYTHONIZE. REMOVE ME. cdef extern from "numpy/arrayobject.h": object PyArray_NewFromDescr(object subtype, np.dtype descr, int nd, np.npy_intp* dims, From 69072274d66871124f32840624715b178cd3d8df Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Sun, 17 Jul 2016 12:39:29 -0500 Subject: [PATCH 44/75] general code cleanup --- sklearn/tree/_splitter.pyx | 2 +- sklearn/tree/_tree.pyx | 4 +- sklearn/tree/_utils.pxd | 4 +- sklearn/tree/_utils.pyx | 83 +++++++++++++++----------------------- 4 files changed, 37 insertions(+), 56 deletions(-) diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index b449f5aecbd8b..0617508aab236 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -35,7 +35,7 @@ from ._utils cimport RAND_R_MAX from ._utils cimport safe_realloc cdef double INFINITY = np.inf -# SPURIOUS COMMENT TO FORCE RECYTHONIZE. REMOVE ME. + # Mitigate precision differences between 32 bit and 64 bit cdef DTYPE_t FEATURE_THRESHOLD = 1e-7 diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 38eb4815ff5e1..0dd1a6c92083b 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -1,4 +1,4 @@ -# cython: cdivision=True +# cython: cdivision=Truex # cython: boundscheck=False # cython: wraparound=False @@ -36,7 +36,7 @@ from ._utils cimport PriorityHeap from ._utils cimport PriorityHeapRecord from ._utils cimport safe_realloc from ._utils cimport sizet_ptr_to_ndarray -# SPURIOUS COMMENT TO FORCE RECYTHONIZE. REMOVE ME. + cdef extern from "numpy/arrayobject.h": object PyArray_NewFromDescr(object subtype, np.dtype descr, int nd, np.npy_intp* dims, diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd index 2f31f915a3191..fe803ba2fe779 100644 --- a/sklearn/tree/_utils.pxd +++ b/sklearn/tree/_utils.pxd @@ -141,8 +141,8 @@ cdef class WeightedMedianHeap: cdef WeightedPQueue samples cdef DOUBLE_t total_weight cdef SIZE_t k - cdef DOUBLE_t sum_w_0_k # represents sum(weights[0:k]) - # = w[0] + w[1] + ... + w[k-1] + cdef DOUBLE_t sum_w_0_k # represents sum(weights[0:k]) + # = w[0] + w[1] + ... + w[k-1] cdef SIZE_t size(self) nogil cdef int push(self, DOUBLE_t data, DOUBLE_t weight) nogil diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index 7cd536080bc77..c38196b38cab9 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -377,7 +377,8 @@ cdef class WeightedPQueue: return 0 cdef int remove(self, DOUBLE_t value, DOUBLE_t weight) nogil: - """Remove a specific value from array""" + """Remove a specific value/weight record from the array. + Returns 0 if successful, -1 if record not found.""" cdef SIZE_t array_ptr = self.array_ptr cdef WeightedPQueueRecord* array = self.array_ cdef SIZE_t idx_to_remove = -1 @@ -392,13 +393,11 @@ cdef class WeightedPQueue: idx_to_remove = i break - # should we throw an error if the element isn't found? - # it shouldn't happen, but better to fail noisily...? if idx_to_remove == -1: - with gil: - raise ValueError() + return -1 - # move after the removed element over by one + # shift the elements after the removed element + # to the left. 
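# ---------------------------------------------------------------------------
# Editorial note, not part of the patch: the element-by-element shift below
# makes each removal O(n). A functionally equivalent variant could rely on
# libc's memmove, which handles the overlapping ranges safely (a sketch
# under that assumption, using the cimport Cython provides):
#
#     from libc.string cimport memmove
#     ...
#     memmove(&array[idx_to_remove], &array[idx_to_remove + 1],
#             (array_ptr - idx_to_remove - 1) * sizeof(WeightedPQueueRecord))
# ---------------------------------------------------------------------------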
for i in range(idx_to_remove, array_ptr-1): array[i] = array[i+1] @@ -406,7 +405,8 @@ cdef class WeightedPQueue: return 0 cdef int pop(self, DOUBLE_t* data, DOUBLE_t* weight) nogil: - """Remove top element from array.""" + """Remove the top (minimum) element from array. + Returns 0 if successful, -1 if nothing to remove.""" cdef SIZE_t array_ptr = self.array_ptr cdef WeightedPQueueRecord* array = self.array_ cdef SIZE_t i @@ -414,11 +414,11 @@ cdef class WeightedPQueue: if array_ptr <= 0: return -1 - # Take first element data[0] = array[0].data weight[0] = array[0].weight - # move after the removed element over by one + # shift the elements after the removed element + # to the left. for i in range(0, array_ptr-1): array[i] = array[i+1] @@ -426,7 +426,8 @@ cdef class WeightedPQueue: return 0 cdef int peek(self, DOUBLE_t* data, DOUBLE_t* weight) nogil: - """Write the top element from array to a pointer.""" + """Write the top element from array to a pointer. + Returns 0 if successful, -1 if nothing to write.""" cdef SIZE_t array_ptr = self.array_ptr cdef WeightedPQueueRecord* array = self.array_ if array_ptr <= 0: @@ -444,11 +445,9 @@ cdef class WeightedPQueue: if array_ptr <= 0 or index >= array_ptr: with gil: - print index - print array_ptr - print "FALIED ON WEIGHT" - return -1 - # Take weight at idx + raise ValueError("Tried to access element " + "at index out of bounds.") + # get weight at index return array[index].weight cdef DOUBLE_t get_value_from_index(self, SIZE_t index) nogil: @@ -459,11 +458,9 @@ cdef class WeightedPQueue: if array_ptr <= 0 or index >= array_ptr: with gil: - print index - print array_ptr - print "FALIED ON VALUE" - return -1 - # Take value at idx + raise ValueError("Tried to access element " + "at index out of bounds.") + # get value at index return array[index].data # ============================================================================= @@ -480,6 +477,7 @@ cdef class WeightedMedianHeap: self.sum_w_0_k = 0 cdef SIZE_t size(self) nogil: + """Return the number of samples in the WeightedMedianHeap""" return self.samples.size() cdef int push(self, DOUBLE_t data, DOUBLE_t weight) nogil: @@ -510,12 +508,6 @@ cdef class WeightedMedianHeap: # get the original weighted median self.get_median(¤t_median) self.total_weight += weight - # check if the value inserted is the same as the current median - # if data == current_median: - # # k stays the same, but add weight to sum_w_0_k - # self.k += 1 - # self.sum_w_0_k += weight - # return 0 if data < current_median: # inserting below the median, so increment k and @@ -527,17 +519,19 @@ cdef class WeightedMedianHeap: # minimize k such that sum(W[0:k]) >= total_weight / 2 # minimum value of k is 1 - while(self.k > 1 and (self.sum_w_0_k - self.samples.get_weight_from_index(self.k-1) >= self.total_weight / 2.0)): - # ordering of these statements is very important + while(self.k > 1 and (self.sum_w_0_k - + self.samples.get_weight_from_index(self.k-1) + >= self.total_weight / 2.0)): self.k -= 1 self.sum_w_0_k -= self.samples.get_weight_from_index(self.k) return 0 if data >= current_median: - # inserting above the median + # inserting above or at the median # minimize k such that sum(W[0:k]) >= total_weight / 2 - while(self.k < self.samples.size() and self.sum_w_0_k < self.total_weight / 2.0): + while(self.k < self.samples.size() and + (self.sum_w_0_k < self.total_weight / 2.0)): self.k += 1 self.sum_w_0_k += self.samples.get_weight_from_index(self.k-1) return 0 @@ -591,20 +585,11 @@ cdef class WeightedMedianHeap: 
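# ---------------------------------------------------------------------------
# Editorial note, not part of the patch: the ordering of the next two
# statements matters. get_median() is evaluated while total_weight still
# includes the record being removed, and the data < current_median test
# below classifies the removal against that pre-update median; decrementing
# total_weight first would shift both get_median() and every comparison
# against total_weight / 2 in the re-minimization loops.
# ---------------------------------------------------------------------------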
self.get_median(¤t_median) self.total_weight -= weight - # check if the value removed is the same as the current median - # if data == current_median: - # # with gil: - # # print "removing at median" - # # k stays the same, but remove weight from sum_w_0_k - # self.sum_w_0_k -= weight - # return 0 - if data < current_median: # removing below the median, so decrement k and # then update self.sum_w_0_k accordingly by subtracting # the removed weight - # with gil: - # print "removing below median" + self.k -= 1 # update sum_w_0_k by removing the weight at index k self.sum_w_0_k -= weight @@ -612,19 +597,18 @@ cdef class WeightedMedianHeap: # minimize k such that sum(W[0:k]) >= total_weight / 2 # by incrementing k and updating sum_w_0_k accordingly # until the condition is met. - while(self.k < self.samples.size() and self.sum_w_0_k < self.total_weight / 2.0): - # ordering of these statements is very important + while(self.k < self.samples.size() and + (self.sum_w_0_k < self.total_weight / 2.0)): self.k += 1 self.sum_w_0_k += self.samples.get_weight_from_index(self.k-1) return 0 if data >= current_median: # removing above the median - # with gil: - # print "removing above median" # minimize k such that sum(W[0:k]) >= total_weight / 2 - while(self.k > 1 and self.sum_w_0_k - self.samples.get_weight_from_index(self.k-1) >= self.total_weight / 2.0): - # mind the ordering + while(self.k > 1 and ((self.sum_w_0_k - + self.samples.get_weight_from_index(self.k-1)) + >= self.total_weight / 2.0)): self.k -= 1 self.sum_w_0_k -= self.samples.get_weight_from_index(self.k) return 0 @@ -632,15 +616,12 @@ cdef class WeightedMedianHeap: cdef int get_median(self, double* median) nogil: """Write the median to a pointer, taking into account sample weights.""" - # with gil: - # print "entered get_median" if self.sum_w_0_k < (self.total_weight / 2.0): - # with gil: - # raise ValueError() return -1 if self.sum_w_0_k == (self.total_weight / 2.0): # split median - median[0] = ( (self.samples.get_value_from_index(self.k) + self.samples.get_value_from_index(self.k-1)) / 2.0) + median[0] = (self.samples.get_value_from_index(self.k) + + self.samples.get_value_from_index(self.k-1)) / 2.0 if self.sum_w_0_k > (self.total_weight / 2.0): # whole median median[0] = self.samples.get_value_from_index(self.k-1) From 8d550979933e2eabcf192a44612d46e06845446e Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Sun, 17 Jul 2016 14:02:38 -0500 Subject: [PATCH 45/75] fix typo in _tree.pyx --- sklearn/tree/_tree.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 0dd1a6c92083b..f44320a7b47ae 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -1,4 +1,4 @@ -# cython: cdivision=Truex +# cython: cdivision=True # cython: boundscheck=False # cython: wraparound=False From b465abc06ad605565f6e93a0a8bbd4a61bcb0649 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Sun, 17 Jul 2016 14:49:30 -0500 Subject: [PATCH 46/75] removed some extraneous comments --- sklearn/tree/_criterion.pyx | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 2367bcb5e1c58..0ae7163770bf8 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -1220,10 +1220,6 @@ cdef class MAE(RegressionCriterion): cdef SIZE_t start = self.start cdef SIZE_t pos = self.pos cdef SIZE_t end = self.end - # with gil: - # print "start {}".format(start) - # print "pos {}".format(pos) - # print "end {}".format(end) cdef 
SIZE_t i, p, k cdef DOUBLE_t y_ik @@ -1237,8 +1233,6 @@ cdef class MAE(RegressionCriterion): for k in range(self.n_outputs): ( left_child_heaps[k]).get_median(&median) - # with gil: - # print "median {}".format(median) for p in range(start, pos): i = samples[p] @@ -1246,13 +1240,9 @@ cdef class MAE(RegressionCriterion): impurity_left[0] += fabs(( y_ik) - median) impurity_left[0] /= ((pos - start) * self.n_outputs) - # with gil: - # print "impurity_left[0] {}".format(impurity_left[0]) for k in range(self.n_outputs): ( right_child_heaps[k]).get_median(&median) - # with gil: - # print "median {}".format(median) for p in range(pos, end): i = samples[p] @@ -1260,8 +1250,6 @@ cdef class MAE(RegressionCriterion): impurity_right[0] += fabs(( y_ik) - median) impurity_right[0] /= ((end - pos) * self.n_outputs) - # with gil: - # print "impurity_right[0] {}".format(impurity_right[0]) cdef class FriedmanMSE(MSE): From df9e64a5456215c2240e4b6ada0bd318b85a2293 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Sun, 17 Jul 2016 16:38:41 -0500 Subject: [PATCH 47/75] [ci skip] remove earlier microchanges --- sklearn/tree/_criterion.pyx | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 0ae7163770bf8..ed037dd3aa57d 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -814,7 +814,6 @@ cdef class RegressionCriterion(Criterion): cdef SIZE_t k cdef DOUBLE_t w = 1.0 cdef DOUBLE_t y_ik - cdef DOUBLE_t w_y_ik # Update statistics up to new_pos # @@ -833,8 +832,7 @@ cdef class RegressionCriterion(Criterion): for k in range(self.n_outputs): y_ik = y[i * self.y_stride + k] - w_y_ik = w * y_ik - sum_left[k] += w_y_ik + sum_left[k] += w * y_ik self.weighted_n_left += w else: @@ -848,8 +846,7 @@ cdef class RegressionCriterion(Criterion): for k in range(self.n_outputs): y_ik = y[i * self.y_stride + k] - w_y_ik = w * y_ik - sum_left[k] -= w_y_ik + sum_left[k] -= w * y_ik self.weighted_n_left -= w From 32c1fefcef3c0c1a154c08731040236304e85a72 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Sun, 17 Jul 2016 16:41:11 -0500 Subject: [PATCH 48/75] [ci skip] remove change to priorityheap --- sklearn/tree/_utils.pyx | 65 +++++++++++++++++++++-------------------- 1 file changed, 33 insertions(+), 32 deletions(-) diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index c38196b38cab9..2c1b3fb2b79d1 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -173,36 +173,6 @@ cdef class Stack: # PriorityHeap data structure # ============================================================================= -cdef class PriorityHeap: - """A priority queue implemented as a binary heap. - - The heap invariant is that the impurity improvement of the parent record - is larger then the impurity improvement of the children. - - Attributes - ---------- - capacity : SIZE_t - The capacity of the heap - - heap_ptr : SIZE_t - The water mark of the heap; the heap grows from left to right in the - array ``heap_``. The following invariant holds ``heap_ptr < capacity``. - - heap_ : PriorityHeapRecord* - The array of heap records. 
The maximum element is on the left; - the heap grows from left to right - """ - - def __cinit__(self, SIZE_t capacity): - self.capacity = capacity - self.heap_ptr = 0 - self.heap_ = malloc(capacity * sizeof(PriorityHeapRecord)) - if self.heap_ == NULL: - raise MemoryError() - - def __dealloc__(self): - free(self.heap_) - cdef void heapify_up(self, PriorityHeapRecord* heap, SIZE_t pos) nogil: """Restore heap invariant parent.improvement > child.improvement from ``pos`` upwards. """ @@ -235,6 +205,37 @@ cdef class PriorityHeap: heap[pos], heap[largest] = heap[largest], heap[pos] self.heapify_down(heap, largest, heap_length) + +cdef class PriorityHeap: + """A priority queue implemented as a binary heap. + + The heap invariant is that the impurity improvement of the parent record + is larger then the impurity improvement of the children. + + Attributes + ---------- + capacity : SIZE_t + The capacity of the heap + + heap_ptr : SIZE_t + The water mark of the heap; the heap grows from left to right in the + array ``heap_``. The following invariant holds ``heap_ptr < capacity``. + + heap_ : PriorityHeapRecord* + The array of heap records. The maximum element is on the left; + the heap grows from left to right + """ + + def __cinit__(self, SIZE_t capacity): + self.capacity = capacity + self.heap_ptr = 0 + self.heap_ = malloc(capacity * sizeof(PriorityHeapRecord)) + if self.heap_ == NULL: + raise MemoryError() + + def __dealloc__(self): + free(self.heap_) + cdef bint is_empty(self) nogil: return self.heap_ptr <= 0 @@ -274,7 +275,7 @@ cdef class PriorityHeap: heap[heap_ptr].improvement = improvement # Heapify up - self.heapify_up(heap, heap_ptr) + heapify_up(heap, heap_ptr) # Increase element count self.heap_ptr = heap_ptr + 1 @@ -296,7 +297,7 @@ cdef class PriorityHeap: # Restore heap invariant if heap_ptr > 1: - self.heapify_down(heap, 0, heap_ptr - 1) + heapify_down(heap, 0, heap_ptr - 1) self.heap_ptr = heap_ptr - 1 From 5e2cd1a232f0b30864993ea21532af823c98a76e Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Sun, 17 Jul 2016 16:42:48 -0500 Subject: [PATCH 49/75] [ci skip] fix indentation --- sklearn/tree/_utils.pyx | 62 ++++++++++++++++++++--------------------- 1 file changed, 31 insertions(+), 31 deletions(-) diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index 2c1b3fb2b79d1..21756f89f50af 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -173,37 +173,37 @@ cdef class Stack: # PriorityHeap data structure # ============================================================================= - cdef void heapify_up(self, PriorityHeapRecord* heap, SIZE_t pos) nogil: - """Restore heap invariant parent.improvement > child.improvement from - ``pos`` upwards. """ - if pos == 0: - return - - cdef SIZE_t parent_pos = (pos - 1) / 2 - - if heap[parent_pos].improvement < heap[pos].improvement: - heap[parent_pos], heap[pos] = heap[pos], heap[parent_pos] - self.heapify_up(heap, parent_pos) - - cdef void heapify_down(self, PriorityHeapRecord* heap, SIZE_t pos, - SIZE_t heap_length) nogil: - """Restore heap invariant parent.improvement > children.improvement from - ``pos`` downwards. 
""" - cdef SIZE_t left_pos = 2 * (pos + 1) - 1 - cdef SIZE_t right_pos = 2 * (pos + 1) - cdef SIZE_t largest = pos - - if (left_pos < heap_length and - heap[left_pos].improvement > heap[largest].improvement): - largest = left_pos - - if (right_pos < heap_length and - heap[right_pos].improvement > heap[largest].improvement): - largest = right_pos - - if largest != pos: - heap[pos], heap[largest] = heap[largest], heap[pos] - self.heapify_down(heap, largest, heap_length) +cdef void heapify_up(self, PriorityHeapRecord* heap, SIZE_t pos) nogil: + """Restore heap invariant parent.improvement > child.improvement from + ``pos`` upwards. """ + if pos == 0: + return + + cdef SIZE_t parent_pos = (pos - 1) / 2 + + if heap[parent_pos].improvement < heap[pos].improvement: + heap[parent_pos], heap[pos] = heap[pos], heap[parent_pos] + self.heapify_up(heap, parent_pos) + +cdef void heapify_down(self, PriorityHeapRecord* heap, SIZE_t pos, + SIZE_t heap_length) nogil: + """Restore heap invariant parent.improvement > children.improvement from + ``pos`` downwards. """ + cdef SIZE_t left_pos = 2 * (pos + 1) - 1 + cdef SIZE_t right_pos = 2 * (pos + 1) + cdef SIZE_t largest = pos + + if (left_pos < heap_length and + heap[left_pos].improvement > heap[largest].improvement): + largest = left_pos + + if (right_pos < heap_length and + heap[right_pos].improvement > heap[largest].improvement): + largest = right_pos + + if largest != pos: + heap[pos], heap[largest] = heap[largest], heap[pos] + self.heapify_down(heap, largest, heap_length) cdef class PriorityHeap: From 9f1b5fd13b406f4725cdfaff410f87a220f56eab Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Sun, 17 Jul 2016 16:44:53 -0500 Subject: [PATCH 50/75] [ci skip] fix class-specific issues with heaps --- sklearn/tree/_utils.pyx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index 21756f89f50af..a327cb705b436 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -173,7 +173,7 @@ cdef class Stack: # PriorityHeap data structure # ============================================================================= -cdef void heapify_up(self, PriorityHeapRecord* heap, SIZE_t pos) nogil: +cdef void heapify_up(PriorityHeapRecord* heap, SIZE_t pos) nogil: """Restore heap invariant parent.improvement > child.improvement from ``pos`` upwards. """ if pos == 0: @@ -183,9 +183,9 @@ cdef void heapify_up(self, PriorityHeapRecord* heap, SIZE_t pos) nogil: if heap[parent_pos].improvement < heap[pos].improvement: heap[parent_pos], heap[pos] = heap[pos], heap[parent_pos] - self.heapify_up(heap, parent_pos) + heapify_up(heap, parent_pos) -cdef void heapify_down(self, PriorityHeapRecord* heap, SIZE_t pos, +cdef void heapify_down(PriorityHeapRecord* heap, SIZE_t pos, SIZE_t heap_length) nogil: """Restore heap invariant parent.improvement > children.improvement from ``pos`` downwards. 
""" @@ -203,7 +203,7 @@ cdef void heapify_down(self, PriorityHeapRecord* heap, SIZE_t pos, if largest != pos: heap[pos], heap[largest] = heap[largest], heap[pos] - self.heapify_down(heap, largest, heap_length) + heapify_down(heap, largest, heap_length) cdef class PriorityHeap: From 802e1fdd4f6abe2c0f73163dec51f2b0448115fe Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Sun, 17 Jul 2016 16:45:44 -0500 Subject: [PATCH 51/75] [ci skip] restore a newline --- sklearn/tree/_utils.pyx | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index a327cb705b436..ee0aea55eb08b 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -185,6 +185,7 @@ cdef void heapify_up(PriorityHeapRecord* heap, SIZE_t pos) nogil: heap[parent_pos], heap[pos] = heap[pos], heap[parent_pos] heapify_up(heap, parent_pos) + cdef void heapify_down(PriorityHeapRecord* heap, SIZE_t pos, SIZE_t heap_length) nogil: """Restore heap invariant parent.improvement > children.improvement from From c0401a5e7fb64c701120964d0bf1c9084244b9d9 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Sun, 17 Jul 2016 16:47:35 -0500 Subject: [PATCH 52/75] [ci skip] remove microchange to refactor later --- sklearn/tree/_criterion.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index ed037dd3aa57d..bd29b9b4fcc11 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -960,7 +960,7 @@ cdef class MSE(RegressionCriterion): for k in range(self.n_outputs): impurity_left[0] -= (sum_left[k] / self.weighted_n_left) ** 2.0 - impurity_right[0] -= (sum_right[k] / self.weighted_n_right) ** 2.0 + impurity_right[0] -= (sum_right[k] / self.weighted_n_right) ** 2.0 impurity_left[0] /= self.n_outputs impurity_right[0] /= self.n_outputs From 0bfc2c32e0f584d51bce6e53f629fb60823b5047 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Sun, 17 Jul 2016 17:13:00 -0500 Subject: [PATCH 53/75] reword a comment --- sklearn/tree/_criterion.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index bd29b9b4fcc11..69da8258675fc 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -1149,7 +1149,7 @@ cdef class MAE(RegressionCriterion): for k in range(self.n_outputs): y_ik = y[i * self.y_stride + k] - # remove y_ik with weight w from right and add to left + # remove y_ik and its weight w from right and add to left ( right_child_heaps[k]).remove(y_ik, w) ( left_child_heaps[k]).push(y_ik, w) From 702bb6bb70a1fc78cb3b65e44148f2e48f8ecd38 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Sun, 17 Jul 2016 17:18:31 -0500 Subject: [PATCH 54/75] remove heapify methods from queue class --- sklearn/tree/_utils.pxd | 3 --- 1 file changed, 3 deletions(-) diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd index fe803ba2fe779..f27bb436f1a36 100644 --- a/sklearn/tree/_utils.pxd +++ b/sklearn/tree/_utils.pxd @@ -104,9 +104,6 @@ cdef class PriorityHeap: double impurity, double impurity_left, double impurity_right) nogil cdef int pop(self, PriorityHeapRecord* res) nogil - cdef void heapify_up(self, PriorityHeapRecord* heap, SIZE_t pos) nogil - cdef void heapify_down(self, PriorityHeapRecord* heap, SIZE_t pos, - SIZE_t heap_length) nogil # ============================================================================= # WeightedPQueue data structure From 327ea19625e849092538e9d57aed737302f73817 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: 
Mon, 18 Jul 2016 05:42:48 -0500 Subject: [PATCH 55/75] doc: update docstrings for dt, rf, and et regressors --- sklearn/ensemble/forest.py | 12 ++++++++---- sklearn/tree/_criterion.pyx | 2 +- sklearn/tree/tree.py | 8 ++++---- 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/sklearn/ensemble/forest.py b/sklearn/ensemble/forest.py index e26323f65bfee..f4680071488f0 100644 --- a/sklearn/ensemble/forest.py +++ b/sklearn/ensemble/forest.py @@ -947,8 +947,10 @@ class RandomForestRegressor(ForestRegressor): The number of trees in the forest. criterion : string, optional (default="mse") - The function to measure the quality of a split. The only supported - criterion is "mse" for the mean squared error. + The function to measure the quality of a split. Supported criteria + are "mse" for the mean squared error, which is equal to variance + reduction as feature selection criterion, and "mae" for the mean + absolute error. max_features : int, float, string or None, optional (default="auto") The number of features to consider when looking for the best split: @@ -1299,8 +1301,10 @@ class ExtraTreesRegressor(ForestRegressor): The number of trees in the forest. criterion : string, optional (default="mse") - The function to measure the quality of a split. The only supported - criterion is "mse" for the mean squared error. + The function to measure the quality of a split. Supported criteria + are "mse" for the mean squared error, which is equal to variance + reduction as feature selection criterion, and "mae" for the mean + absolute error. max_features : int, float, string or None, optional (default="auto") The number of features to consider when looking for the best split: diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 69da8258675fc..9971ac6fe2cd2 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -725,7 +725,7 @@ cdef class RegressionCriterion(Criterion): self.sum_left = calloc(n_outputs, sizeof(double)) self.sum_right = calloc(n_outputs, sizeof(double)) - if (self.sum_total == NULL or + if (self.sum_total == NULL or self.sum_left == NULL or self.sum_right == NULL): raise MemoryError() diff --git a/sklearn/tree/tree.py b/sklearn/tree/tree.py index 4f8ebf9e960ed..deca4c7730754 100644 --- a/sklearn/tree/tree.py +++ b/sklearn/tree/tree.py @@ -783,10 +783,10 @@ class DecisionTreeRegressor(BaseDecisionTree, RegressorMixin): Parameters ---------- criterion : string, optional (default="mse") - The function to measure the quality of a split. Supported - criterions are "mse" for the mean squared error, which is - equal to variance reduction as feature selection criterion, - and "mae" for the mean absolute deviation. + The function to measure the quality of a split. Supported criteria + are "mse" for the mean squared error, which is equal to variance + reduction as feature selection criterion, and "mae" for the mean + absolute error. splitter : string, optional (default="best") The strategy used to choose the split at each node. 
Supported From 469274d4371bc43b4145b3ed84084df2e33e7039 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Mon, 18 Jul 2016 07:23:37 -0500 Subject: [PATCH 56/75] doc: revert incorrect spacing to shorten diff --- sklearn/tree/_criterion.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 9971ac6fe2cd2..0dbd837330427 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -725,7 +725,7 @@ cdef class RegressionCriterion(Criterion): self.sum_left = calloc(n_outputs, sizeof(double)) self.sum_right = calloc(n_outputs, sizeof(double)) - if (self.sum_total == NULL or + if (self.sum_total == NULL or self.sum_left == NULL or self.sum_right == NULL): raise MemoryError() @@ -1165,7 +1165,7 @@ cdef class MAE(RegressionCriterion): for k in range(self.n_outputs): y_ik = y[i * self.y_stride + k] - # remove y_ik from left and add to right + # remove y_ik and its weight w from left and add to right ( left_child_heaps[k]).remove(y_ik, w) ( right_child_heaps[k]).push(y_ik, w) From 560f6fa1879ab1a8e401344f0e7209e93e2a4843 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Mon, 18 Jul 2016 16:09:37 -0500 Subject: [PATCH 57/75] convert get_median to return value directly --- sklearn/tree/_criterion.pyx | 6 +++--- sklearn/tree/_utils.pxd | 9 ++++++--- sklearn/tree/_utils.pyx | 29 +++++++++++++---------------- 3 files changed, 22 insertions(+), 22 deletions(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 0dbd837330427..14beaed255c7f 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -1061,7 +1061,7 @@ cdef class MAE(RegressionCriterion): # calculate the node medians for k in range(self.n_outputs): - ( right_child_heaps[k]).get_median(&(self.node_medians[k])) + self.node_medians[k] = ( right_child_heaps[k]).get_median() # Reset to pos=start self.reset() @@ -1229,7 +1229,7 @@ cdef class MAE(RegressionCriterion): impurity_right[0] = 0.0 for k in range(self.n_outputs): - ( left_child_heaps[k]).get_median(&median) + median = ( left_child_heaps[k]).get_median() for p in range(start, pos): i = samples[p] @@ -1239,7 +1239,7 @@ cdef class MAE(RegressionCriterion): impurity_left[0] /= ((pos - start) * self.n_outputs) for k in range(self.n_outputs): - ( right_child_heaps[k]).get_median(&median) + median = ( right_child_heaps[k]).get_median() for p in range(pos, end): i = samples[p] diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd index f27bb436f1a36..88d6378c0c29b 100644 --- a/sklearn/tree/_utils.pxd +++ b/sklearn/tree/_utils.pxd @@ -2,6 +2,7 @@ # Peter Prettenhofer # Arnaud Joly # Jacob Schreiber +# Nelson Liu # # License: BSD 3 clause @@ -143,8 +144,10 @@ cdef class WeightedMedianHeap: cdef SIZE_t size(self) nogil cdef int push(self, DOUBLE_t data, DOUBLE_t weight) nogil - cdef int update_median_parameters_post_push(self, DOUBLE_t data, DOUBLE_t weight) nogil + cdef int update_median_parameters_post_push(self, DOUBLE_t data, + DOUBLE_t weight) nogil cdef int remove(self, DOUBLE_t data, DOUBLE_t weight) nogil cdef int pop(self, DOUBLE_t* data, DOUBLE_t* weight) nogil - cdef int update_median_parameters_post_remove(self, DOUBLE_t data, DOUBLE_t weight) nogil - cdef int get_median(self, double* median) nogil + cdef int update_median_parameters_post_remove(self, DOUBLE_t data, + DOUBLE_t weight) nogil + cdef double get_median(self) nogil diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index ee0aea55eb08b..e4b93fa2f10bf 100644 --- 
a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -328,7 +328,8 @@ cdef class WeightedPQueue: def __cinit__(self, SIZE_t capacity): self.capacity = capacity self.array_ptr = 0 - self.array_ = calloc(capacity, sizeof(WeightedPQueueRecord)) + self.array_ = calloc(capacity, + sizeof(WeightedPQueueRecord)) if self.array_ == NULL: raise MemoryError() @@ -485,9 +486,8 @@ cdef class WeightedMedianHeap: cdef int push(self, DOUBLE_t data, DOUBLE_t weight) nogil: """Push a value and its associated weight to the WeightedMedianHeap to be considered - in the median calculation + in the median calculation. """ - cdef double current_median cdef int return_value return_value = self.samples.push(data, weight) @@ -508,7 +508,7 @@ return 0 # get the original weighted median - self.get_median(&current_median) + current_median = self.get_median() self.total_weight += weight if data < current_median: @@ -521,8 +521,8 @@ # minimize k such that sum(W[0:k]) >= total_weight / 2 # minimum value of k is 1 - while(self.k > 1 and (self.sum_w_0_k - - self.samples.get_weight_from_index(self.k-1) + while(self.k > 1 and ((self.sum_w_0_k - + self.samples.get_weight_from_index(self.k-1)) >= self.total_weight / 2.0)): self.k -= 1 self.sum_w_0_k -= self.samples.get_weight_from_index(self.k) @@ -568,7 +568,7 @@ DOUBLE_t weight) nogil: """Update the parameters used in the median calculation, namely `k` and `sum_w_0_k` after a removal""" - cdef DOUBLE_t current_median + cdef double current_median # reset parameters because empty if self.samples.size() == 0: self.k = 0 @@ -584,7 +584,7 @@ return 0 # get the current weighted median - self.get_median(&current_median) + current_median = self.get_median() self.total_weight -= weight if data < current_median: @@ -609,22 +609,19 @@ # removing above the median # minimize k such that sum(W[0:k]) >= total_weight / 2 while(self.k > 1 and ((self.sum_w_0_k - - self.samples.get_weight_from_index(self.k-1)) + self.samples.get_weight_from_index(self.k-1)) >= self.total_weight / 2.0)): self.k -= 1 self.sum_w_0_k -= self.samples.get_weight_from_index(self.k) return 0 - cdef int get_median(self, double* median) nogil: + cdef double get_median(self) nogil: """Return the median, taking into account sample weights.""" - if self.sum_w_0_k < (self.total_weight / 2.0): - return -1 if self.sum_w_0_k == (self.total_weight / 2.0): # split median - median[0] = (self.samples.get_value_from_index(self.k) + - self.samples.get_value_from_index(self.k-1)) / 2.0 + return (self.samples.get_value_from_index(self.k) + + self.samples.get_value_from_index(self.k-1)) / 2.0 if self.sum_w_0_k > (self.total_weight / 2.0): # whole median - median[0] = self.samples.get_value_from_index(self.k-1) - return 0 + return self.samples.get_value_from_index(self.k-1) From 87b01807f6e03066b19df7dca37775d58cc8d0f3 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Mon, 18 Jul 2016 16:11:14 -0500 Subject: [PATCH 58/75] [ci skip] remove accidental whitespace --- sklearn/tree/_criterion.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 14beaed255c7f..40fb73b4bdb69 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -1305,5 +1305,5 @@ cdef class FriedmanMSE(MSE): diff = (self.weighted_n_right * total_sum_left - self.weighted_n_left * total_sum_right) / self.n_outputs -
return (diff * diff / (self.weighted_n_left * self.weighted_n_right * + return (diff * diff / (self.weighted_n_left * self.weighted_n_right * self.weighted_n_node_samples)) From ecae6754fae1a919697696043a2dfa8269b8f477 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Mon, 18 Jul 2016 20:56:18 -0700 Subject: [PATCH 59/75] remove extraneous unpacking of values --- sklearn/tree/_utils.pyx | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index e4b93fa2f10bf..dec793b44da9e 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -431,9 +431,8 @@ cdef class WeightedPQueue: cdef int peek(self, DOUBLE_t* data, DOUBLE_t* weight) nogil: """Write the top element from array to a pointer. Returns 0 if successful, -1 if nothing to write.""" - cdef SIZE_t array_ptr = self.array_ptr cdef WeightedPQueueRecord* array = self.array_ - if array_ptr <= 0: + if self.array_ptr <= 0: return -1 # Take first value data[0] = array[0].data @@ -443,10 +442,8 @@ cdef class WeightedPQueue: cdef DOUBLE_t get_weight_from_index(self, SIZE_t index) nogil: """Given an index between [0,self.current_capacity], access the appropriate heap and return the requested weight""" - cdef SIZE_t array_ptr = self.array_ptr cdef WeightedPQueueRecord* array = self.array_ - - if array_ptr <= 0 or index >= array_ptr: + if self.array_ptr <= 0 or index >= self.array_ptr: with gil: raise ValueError("Tried to access element " "at index out of bounds.") From 6c2835862aba0f77f4cb8d83a10176e168d6217c Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Mon, 18 Jul 2016 21:02:54 -0700 Subject: [PATCH 60/75] style: misc changes to identifiers --- sklearn/tree/_criterion.pyx | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 40fb73b4bdb69..5be5700c45f93 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -1035,7 +1035,7 @@ cdef class MAE(RegressionCriterion): cdef SIZE_t k cdef DOUBLE_t y_ik cdef DOUBLE_t w = 1.0 - # Fill accumulators with MedianHeaps + with gil: for k in range(self.n_outputs): self.left_child_heaps[k] = WeightedMedianHeap(self.n_node_samples) @@ -1071,8 +1071,8 @@ cdef class MAE(RegressionCriterion): cdef SIZE_t i cdef SIZE_t k - cdef DOUBLE_t popped_value - cdef DOUBLE_t popped_weight + cdef DOUBLE_t value + cdef DOUBLE_t weight cdef void** left_child_heaps = self.left_child_heaps.data cdef void** right_child_heaps = self.right_child_heaps.data @@ -1088,10 +1088,10 @@ cdef class MAE(RegressionCriterion): # if left has no elements, it's already reset for i in range(( left_child_heaps[k]).size()): # remove everything from left and put it into right - ( left_child_heaps[k]).pop(&popped_value, - &popped_weight) - ( right_child_heaps[k]).push(popped_value, - popped_weight) + ( left_child_heaps[k]).pop(&value, + &weight) + ( right_child_heaps[k]).push(value, + weight) cdef void reverse_reset(self) nogil: """Reset the criterion at pos=end.""" @@ -1100,8 +1100,8 @@ cdef class MAE(RegressionCriterion): self.weighted_n_left = self.weighted_n_node_samples self.pos = self.end - cdef DOUBLE_t popped_value - cdef DOUBLE_t popped_weight + cdef DOUBLE_t value + cdef DOUBLE_t weight cdef void** left_child_heaps = self.left_child_heaps.data cdef void** right_child_heaps = self.right_child_heaps.data @@ -1111,10 +1111,10 @@ cdef class MAE(RegressionCriterion): # if right has no elements, it's already reset for i in range(( 
right_child_heaps[k]).size()): # remove everything from right and put it into left - ( right_child_heaps[k]).pop(&popped_value, - &popped_weight) - ( left_child_heaps[k]).push(popped_value, - popped_weight) + ( right_child_heaps[k]).pop(&value, + &weight) + ( left_child_heaps[k]).push(value, + weight) cdef void update(self, SIZE_t new_pos) nogil: """Updated statistics by moving samples[pos:new_pos] to the left.""" From 0db99659a216ec1e90099726edef5ace53da30a6 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Tue, 19 Jul 2016 11:46:04 -0700 Subject: [PATCH 61/75] add docstrings and more informative variable identifiers --- sklearn/tree/_criterion.pyx | 66 ++++++++++++++++++------------------- sklearn/tree/_utils.pxd | 4 +-- sklearn/tree/_utils.pyx | 55 +++++++++++++++++++++++++------ 3 files changed, 80 insertions(+), 45 deletions(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 5be5700c45f93..2834609cff17a 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -29,7 +29,7 @@ np.import_array() from ._utils cimport log from ._utils cimport safe_realloc from ._utils cimport sizet_ptr_to_ndarray -from ._utils cimport WeightedMedianHeap +from ._utils cimport WeightedMedianCalculator cdef class Criterion: """Interface for impurity criteria. @@ -971,8 +971,8 @@ cdef class MAE(RegressionCriterion): """Destructor.""" free(self.node_medians) - cdef np.ndarray left_child_heaps - cdef np.ndarray right_child_heaps + cdef np.ndarray left_child + cdef np.ndarray right_child cdef double* node_medians def __cinit__(self, SIZE_t n_outputs): @@ -1010,8 +1010,8 @@ cdef class MAE(RegressionCriterion): if (self.node_medians == NULL): raise MemoryError() - self.left_child_heaps = np.empty(n_outputs, dtype='object') - self.right_child_heaps = np.empty(n_outputs, dtype='object') + self.left_child = np.empty(n_outputs, dtype='object') + self.right_child = np.empty(n_outputs, dtype='object') cdef void init(self, DOUBLE_t* y, SIZE_t y_stride, DOUBLE_t* sample_weight, double weighted_n_samples, SIZE_t* samples, SIZE_t start, @@ -1038,11 +1038,11 @@ cdef class MAE(RegressionCriterion): with gil: for k in range(self.n_outputs): - self.left_child_heaps[k] = WeightedMedianHeap(self.n_node_samples) - self.right_child_heaps[k] = WeightedMedianHeap(self.n_node_samples) + self.left_child[k] = WeightedMedianCalculator(self.n_node_samples) + self.right_child[k] = WeightedMedianCalculator(self.n_node_samples) - cdef void** left_child_heaps = self.left_child_heaps.data - cdef void** right_child_heaps = self.right_child_heaps.data + cdef void** left_child = self.left_child.data + cdef void** right_child = self.right_child.data for p in range(start, end): i = samples[p] @@ -1055,13 +1055,13 @@ cdef class MAE(RegressionCriterion): # push all values to the right side, # since pos = start initially anyway - ( right_child_heaps[k]).push(y_ik, w) + ( right_child[k]).push(y_ik, w) self.weighted_n_node_samples += w # calculate the node medians for k in range(self.n_outputs): - self.node_medians[k] = ( right_child_heaps[k]).get_median() + self.node_medians[k] = ( right_child[k]).get_median() # Reset to pos=start self.reset() @@ -1074,8 +1074,8 @@ cdef class MAE(RegressionCriterion): cdef DOUBLE_t value cdef DOUBLE_t weight - cdef void** left_child_heaps = self.left_child_heaps.data - cdef void** right_child_heaps = self.right_child_heaps.data + cdef void** left_child = self.left_child.data + cdef void** right_child = self.right_child.data self.weighted_n_left = 0.0 self.weighted_n_right = 
self.weighted_n_node_samples @@ -1086,11 +1086,11 @@ cdef class MAE(RegressionCriterion): for k in range(self.n_outputs): # if left has no elements, it's already reset for i in range(( left_child[k]).size()): # remove everything from left and put it into right - ( left_child_heaps[k]).pop(&value, + ( left_child[k]).pop(&value, &weight) - ( right_child_heaps[k]).push(value, + ( right_child[k]).push(value, weight) cdef void reverse_reset(self) nogil: @@ -1102,18 +1102,18 @@ cdef class MAE(RegressionCriterion): cdef DOUBLE_t value cdef DOUBLE_t weight - cdef void** left_child_heaps = self.left_child_heaps.data - cdef void** right_child_heaps = self.right_child_heaps.data + cdef void** left_child = self.left_child.data + cdef void** right_child = self.right_child.data # reverse reset the WeightedMedianCalculators, right should have no # elements and left should have all elements. for k in range(self.n_outputs): # if right has no elements, it's already reset for i in range(( right_child[k]).size()): # remove everything from right and put it into left - ( right_child[k]).pop(&value, + ( right_child[k]).pop(&value, &weight) - ( left_child[k]).push(value, + ( left_child[k]).push(value, weight) cdef void update(self, SIZE_t new_pos) nogil: """Update statistics by moving samples[pos:new_pos] to the left.""" cdef double* sample_weight = self.sample_weight cdef SIZE_t* samples = self.samples - cdef void** left_child_heaps = self.left_child_heaps.data - cdef void** right_child_heaps = self.right_child_heaps.data + cdef void** left_child = self.left_child.data + cdef void** right_child = self.right_child.data cdef DOUBLE_t* y = self.y cdef SIZE_t pos = self.pos cdef SIZE_t end = self.end cdef SIZE_t i cdef SIZE_t p cdef SIZE_t k cdef DOUBLE_t w = 1.0 cdef DOUBLE_t y_ik # Update statistics up to new_pos # # We are going to update right_child and left_child # from the direction that requires the least amount of # computation, i.e. from pos to new_pos or from end to new_pos.
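The comment above is the heart of MAE.update: samples are always moved into the left child, but from whichever end of the range is cheaper. A rough pure-Python sketch of that control flow follows; move_to_left, samples, y, w, left, and right are hypothetical stand-ins rather than the Cython API, and the real method also maintains weighted_n_left and weighted_n_right as weights cross over.

    # Sketch: move samples[pos:new_pos] into the left child, working from
    # whichever end of the range needs fewer operations. `left` and `right`
    # are any objects exposing push(value, weight) and remove(value, weight).
    def move_to_left(samples, y, w, pos, new_pos, end, left, right):
        if (new_pos - pos) <= (end - new_pos):
            # forward pass: only samples[pos:new_pos] change sides
            for p in range(pos, new_pos):
                i = samples[p]
                right.remove(y[i], w[i])
                left.push(y[i], w[i])
        else:
            # backward pass: first put every remaining sample into the left
            # child (the reverse_reset step), then push samples[new_pos:end]
            # back to the right
            for p in range(pos, end):
                i = samples[p]
                right.remove(y[i], w[i])
                left.push(y[i], w[i])
            for p in range(end - 1, new_pos - 1, -1):
                i = samples[p]
                left.remove(y[i], w[i])
                right.push(y[i], w[i])
        return new_pos

Either branch ends with left holding samples[start:new_pos] and right holding samples[new_pos:end]; the backward pass pays off when new_pos sits close to end, since reverse_reset drains one side with cheap pops rather than linear-time removes.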
@@ -1150,8 +1150,8 @@ cdef class MAE(RegressionCriterion): for k in range(self.n_outputs): y_ik = y[i * self.y_stride + k] # remove y_ik and its weight w from right and add to left - ( right_child_heaps[k]).remove(y_ik, w) - ( left_child_heaps[k]).push(y_ik, w) + ( right_child[k]).remove(y_ik, w) + ( left_child[k]).push(y_ik, w) self.weighted_n_left += w else: @@ -1166,8 +1166,8 @@ cdef class MAE(RegressionCriterion): for k in range(self.n_outputs): y_ik = y[i * self.y_stride + k] # remove y_ik and its weight w from left and add to right - ( left_child_heaps[k]).remove(y_ik, w) - ( right_child_heaps[k]).push(y_ik, w) + ( left_child[k]).remove(y_ik, w) + ( right_child[k]).push(y_ik, w) self.weighted_n_left -= w @@ -1222,14 +1222,14 @@ cdef class MAE(RegressionCriterion): cdef DOUBLE_t y_ik cdef DOUBLE_t median - cdef void** left_child_heaps = self.left_child_heaps.data - cdef void** right_child_heaps = self.right_child_heaps.data + cdef void** left_child = self.left_child.data + cdef void** right_child = self.right_child.data impurity_left[0] = 0.0 impurity_right[0] = 0.0 for k in range(self.n_outputs): - median = ( left_child_heaps[k]).get_median() + median = ( left_child[k]).get_median() for p in range(start, pos): i = samples[p] @@ -1239,7 +1239,7 @@ cdef class MAE(RegressionCriterion): impurity_left[0] /= ((pos - start) * self.n_outputs) for k in range(self.n_outputs): - median = ( right_child_heaps[k]).get_median() + median = ( right_child[k]).get_median() for p in range(pos, end): i = samples[p] @@ -1305,5 +1305,5 @@ cdef class FriedmanMSE(MSE): diff = (self.weighted_n_right * total_sum_left - self.weighted_n_left * total_sum_right) / self.n_outputs - return (diff * diff / (self.weighted_n_left * self.weighted_n_right * + return (diff * diff / (self.weighted_n_left * self.weighted_n_right * self.weighted_n_node_samples)) diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd index 88d6378c0c29b..b07e550789f89 100644 --- a/sklearn/tree/_utils.pxd +++ b/sklearn/tree/_utils.pxd @@ -131,10 +131,10 @@ cdef class WeightedPQueue: # ============================================================================= -# MedianHeap data structure +# WeightedMedianCalculator data structure # ============================================================================= -cdef class WeightedMedianHeap: +cdef class WeightedMedianCalculator: cdef SIZE_t initial_capacity cdef WeightedPQueue samples cdef DOUBLE_t total_weight diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index dec793b44da9e..40d6d19c5ca85 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -314,15 +314,17 @@ cdef class WeightedPQueue: Attributes ---------- capacity : SIZE_t - The capacity of the array + The capacity of the priority queue. array_ptr : SIZE_t - The water mark of the array; the array grows from left to right in the - array ``array_``. array_ptr is always less than capacity. + The water mark of the priority queue; the priority queue grows from + left to right in the array ``array_``. ``array_ptr`` is always + less than ``capacity``. array_ : WeightedPQueueRecord* - The array of array records. The minimum element is on the left; - the array grows from left to right + The array of priority queue records. The minimum element is on the + left at index 0, and the maximum element is on the right at index + ``array_ptr-1``. 
""" def __cinit__(self, SIZE_t capacity): @@ -464,10 +466,42 @@ cdef class WeightedPQueue: return array[index].data # ============================================================================= -# WeightedMedianHeap data structure +# WeightedMedianCalculator data structure # ============================================================================= -cdef class WeightedMedianHeap: +cdef class WeightedMedianCalculator: + """A class to handle calculation of the weighted median from streams of + data. To do so, it maintains a parameter ``k`` such that the sum of the + weights in the range [0,k) is greater than or equal to half of the total + weight. By minimizing the value of ``k`` that fulfills this constraint, + calculating the median is done by either taking the value of the sample + at index ``k-1`` of ``samples`` (samples[k-1].data) or the average of + the samples at index ``k-1`` and ``k`` of ``samples`` + ((samples[k-1] + samples[k]) / 2). + + Attributes + ---------- + initial_capacity : SIZE_t + The initial capacity of the WeightedMedianCalculator. + + samples : WeightedPQueue + Holds the samples (consisting of values and their weights) used in the + weighted median calculation. + + total_weight : DOUBLE_t + The sum of the weights of items in ``samples``. Represents the total + weight of all samples used in the median calculation. + + k : SIZE_t + Index used to calculate the median. + + sum_w_0_k : DOUBLE_t + The sum of the weights from samples[0:k]. Used in the weighted + median calculation; minimizing the value of ``k`` such that + ``sum_w_0_k`` >= ``total_weight / 2`` provides a mechanism for + calculating the median in constant time. + + """ def __cinit__(self, SIZE_t initial_capacity): self.initial_capacity = initial_capacity @@ -477,12 +511,13 @@ cdef class WeightedMedianHeap: self.sum_w_0_k = 0 cdef SIZE_t size(self) nogil: - """Return the number of samples in the WeightedMedianHeap""" + """Return the number of samples in the + WeightedMedianCalculator""" return self.samples.size() cdef int push(self, DOUBLE_t data, DOUBLE_t weight) nogil: """Push a value and its associated weight - to the WeightedMedianHeap to be considered + to the WeightedMedianCalculator to be considered in the median calculation. 
""" cdef int return_value @@ -566,7 +601,7 @@ cdef class WeightedMedianHeap: """Update the parameters used in the median calculation, namely `k` and `sum_w_0_k` after a removal""" cdef double current_median - # reset parameters because empty + # reset parameters because it there are no elements if self.samples.size() == 0: self.k = 0 self.total_weight = 0 From e37341665f98540a33026bd45b49b19d5f4ac18d Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Tue, 19 Jul 2016 11:58:44 -0700 Subject: [PATCH 62/75] [ci skip] add trivial comments to recythonize --- sklearn/tree/_splitter.pyx | 2 +- sklearn/tree/_tree.pyx | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 0617508aab236..f44d5da2b9ede 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -14,7 +14,7 @@ # Jacob Schreiber # # License: BSD 3 clause - +# trivial comment for recythonize from ._criterion cimport Criterion from libc.stdlib cimport free diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index f44320a7b47ae..8f052b2a69058 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -14,7 +14,7 @@ # Jacob Schreiber # # License: BSD 3 clause - +# trivial comment for recythonize from cpython cimport Py_INCREF, PyObject from libc.stdlib cimport free From 448bb6e518a02d45fbaab5cabdbfc639129b39c1 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Tue, 19 Jul 2016 11:59:06 -0700 Subject: [PATCH 63/75] remove trivial comments for recythonizing --- sklearn/tree/_splitter.pyx | 2 +- sklearn/tree/_tree.pyx | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index f44d5da2b9ede..0617508aab236 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -14,7 +14,7 @@ # Jacob Schreiber # # License: BSD 3 clause -# trivial comment for recythonize + from ._criterion cimport Criterion from libc.stdlib cimport free diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 8f052b2a69058..f44320a7b47ae 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -14,7 +14,7 @@ # Jacob Schreiber # # License: BSD 3 clause -# trivial comment for recythonize + from cpython cimport Py_INCREF, PyObject from libc.stdlib cimport free From c44f327076dfb6b68003fb1bf986b5ae09d70083 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Tue, 19 Jul 2016 12:09:13 -0700 Subject: [PATCH 64/75] force recythonization for real this time --- sklearn/tree/_splitter.pyx | 2 +- sklearn/tree/_tree.pyx | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 0617508aab236..61e40cc81f97b 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -14,7 +14,7 @@ # Jacob Schreiber # # License: BSD 3 clause - +# another trivial comment from ._criterion cimport Criterion from libc.stdlib cimport free diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index f44320a7b47ae..e7bf20b9a1b70 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -14,7 +14,7 @@ # Jacob Schreiber # # License: BSD 3 clause - +# another trivial comment from cpython cimport Py_INCREF, PyObject from libc.stdlib cimport free From 8d442cf8c595f6368b47f15f5e4f63dd92d45111 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Tue, 19 Jul 2016 13:43:00 -0700 Subject: [PATCH 65/75] remove trivial comments for recythonization --- sklearn/tree/_splitter.pyx | 2 +- sklearn/tree/_tree.pyx | 2 +- 2 files 
changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 61e40cc81f97b..0617508aab236 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -14,7 +14,7 @@ # Jacob Schreiber # # License: BSD 3 clause -# another trivial comment + from ._criterion cimport Criterion from libc.stdlib cimport free diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index e7bf20b9a1b70..f44320a7b47ae 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -14,7 +14,7 @@ # Jacob Schreiber # # License: BSD 3 clause -# another trivial comment + from cpython cimport Py_INCREF, PyObject from libc.stdlib cimport free From a0085380be5aa73c046f8dfc30e037d813287fce Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Wed, 20 Jul 2016 08:38:33 -0700 Subject: [PATCH 66/75] rfc: harmonize arg. names and remove unnecessary checks --- sklearn/tree/_utils.pxd | 4 ++-- sklearn/tree/_utils.pyx | 14 +++----------- 2 files changed, 5 insertions(+), 13 deletions(-) diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd index b07e550789f89..45ce0f56acead 100644 --- a/sklearn/tree/_utils.pxd +++ b/sklearn/tree/_utils.pxd @@ -123,9 +123,9 @@ cdef class WeightedPQueue: cdef bint is_empty(self) nogil cdef SIZE_t size(self) nogil cdef int push(self, DOUBLE_t data, DOUBLE_t weight) nogil - cdef int remove(self, DOUBLE_t value, DOUBLE_t weight) nogil + cdef int remove(self, DOUBLE_t data, DOUBLE_t weight) nogil cdef int pop(self, DOUBLE_t* data, DOUBLE_t* weight) nogil - cdef int peek(self, DOUBLE_t* res, DOUBLE_t* weight) nogil + cdef int peek(self, DOUBLE_t* data, DOUBLE_t* weight) nogil cdef DOUBLE_t get_weight_from_index(self, SIZE_t index) nogil cdef DOUBLE_t get_value_from_index(self, SIZE_t index) nogil diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index 40d6d19c5ca85..10b2adcfab560 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -381,7 +381,7 @@ cdef class WeightedPQueue: self.array_ptr = array_ptr + 1 return 0 - cdef int remove(self, DOUBLE_t value, DOUBLE_t weight) nogil: + cdef int remove(self, DOUBLE_t data, DOUBLE_t weight) nogil: """Remove a specific value/weight record from the array. 
Returns 0 if successful, -1 if record not found.""" cdef SIZE_t array_ptr = self.array_ptr @@ -394,7 +394,7 @@ cdef class WeightedPQueue: # find element to remove for i in range(array_ptr): - if array[i].data == value and array[i].weight == weight: + if array[i].data == data and array[i].weight == weight: idx_to_remove = i break @@ -445,23 +445,15 @@ cdef class WeightedPQueue: """Given an index between [0,self.current_capacity], access the appropriate heap and return the requested weight""" cdef WeightedPQueueRecord* array = self.array_ - if self.array_ptr <= 0 or index >= self.array_ptr: - with gil: - raise ValueError("Tried to access element " - "at index out of bounds.") + # get weight at index return array[index].weight cdef DOUBLE_t get_value_from_index(self, SIZE_t index) nogil: """Given an index between [0,self.current_capacity], access the appropriate heap and return the requested value""" - cdef SIZE_t array_ptr = self.array_ptr cdef WeightedPQueueRecord* array = self.array_ - if array_ptr <= 0 or index >= array_ptr: - with gil: - raise ValueError("Tried to access element " - "at index out of bounds.") # get value at index return array[index].data From 929153c8925354845af4fcfb878f92499face390 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Wed, 20 Jul 2016 09:03:10 -0700 Subject: [PATCH 67/75] convert allocations to safe_realloc --- sklearn/tree/_criterion.pyx | 10 +++++----- sklearn/tree/_utils.pxd | 2 ++ sklearn/tree/_utils.pyx | 4 +--- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 2834609cff17a..a0fe837bf6d2f 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -973,7 +973,7 @@ cdef class MAE(RegressionCriterion): cdef np.ndarray left_child cdef np.ndarray right_child - cdef double* node_medians + cdef DOUBLE_t* node_medians def __cinit__(self, SIZE_t n_outputs): """Initialize parameters for this criterion. @@ -1005,7 +1005,7 @@ cdef class MAE(RegressionCriterion): self.node_medians = NULL # Allocate memory for the accumulators - self.node_medians = calloc(n_outputs, sizeof(double)) + safe_realloc(&self.node_medians, n_outputs) if (self.node_medians == NULL): raise MemoryError() @@ -1119,7 +1119,7 @@ cdef class MAE(RegressionCriterion): cdef void update(self, SIZE_t new_pos) nogil: """Updated statistics by moving samples[pos:new_pos] to the left.""" - cdef double* sample_weight = self.sample_weight + cdef DOUBLE_t* sample_weight = self.sample_weight cdef SIZE_t* samples = self.samples cdef void** left_child = self.left_child.data @@ -1180,7 +1180,7 @@ cdef class MAE(RegressionCriterion): cdef SIZE_t k for k in range(self.n_outputs): - dest[k] = self.node_medians[k] + dest[k] = self.node_medians[k] cdef double node_impurity(self) nogil: """Evaluate the impurity of the current node, i.e. 
the impurity of @@ -1201,7 +1201,7 @@ cdef class MAE(RegressionCriterion): y_ik = y[i * self.y_stride + k] - impurity += fabs(( y_ik) - self.node_medians[k]) + impurity += fabs(( y_ik) - self.node_medians[k]) return impurity / (self.weighted_n_node_samples * self.n_outputs) cdef void children_impurity(self, double* impurity_left, diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd index 45ce0f56acead..1b2eb0e76d39f 100644 --- a/sklearn/tree/_utils.pxd +++ b/sklearn/tree/_utils.pxd @@ -34,6 +34,8 @@ ctypedef fused realloc_ptr: (DTYPE_t*) (SIZE_t*) (unsigned char*) + (WeightedPQueueRecord*) + (DOUBLE_t*) cdef realloc_ptr safe_realloc(realloc_ptr* p, size_t nelems) except * diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index 10b2adcfab560..398ef897d1e76 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -13,7 +13,6 @@ from libc.stdlib cimport free from libc.stdlib cimport malloc -from libc.stdlib cimport calloc from libc.stdlib cimport realloc from libc.math cimport log as ln @@ -330,8 +329,7 @@ cdef class WeightedPQueue: def __cinit__(self, SIZE_t capacity): self.capacity = capacity self.array_ptr = 0 - self.array_ = calloc(capacity, - sizeof(WeightedPQueueRecord)) + safe_realloc(&self.array_, capacity) if self.array_ == NULL: raise MemoryError() From f383c94cdd8f2f5045e2b1a28ceba0a509da2d22 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Wed, 20 Jul 2016 13:10:13 -0700 Subject: [PATCH 68/75] fix bug in weighted case and add tests for MAE --- sklearn/tree/_criterion.pyx | 4 ++-- sklearn/tree/_utils.pxd | 6 +++-- sklearn/tree/_utils.pyx | 40 ++++++++++++++++++++------------- sklearn/tree/tests/test_tree.py | 13 +++++++++++ 4 files changed, 44 insertions(+), 19 deletions(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index a0fe837bf6d2f..723810775f789 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -1236,7 +1236,7 @@ cdef class MAE(RegressionCriterion): y_ik = y[i * self.y_stride + k] impurity_left[0] += fabs(( y_ik) - median) - impurity_left[0] /= ((pos - start) * self.n_outputs) + impurity_left[0] /= ((self.weighted_n_left) * self.n_outputs) for k in range(self.n_outputs): median = ( right_child[k]).get_median() @@ -1246,7 +1246,7 @@ cdef class MAE(RegressionCriterion): y_ik = y[i * self.y_stride + k] impurity_right[0] += fabs(( y_ik) - median) - impurity_right[0] /= ((end - pos) * self.n_outputs) + impurity_right[0] /= ((self.weighted_n_right) * self.n_outputs) cdef class FriedmanMSE(MSE): diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd index 1b2eb0e76d39f..25c09783c73ba 100644 --- a/sklearn/tree/_utils.pxd +++ b/sklearn/tree/_utils.pxd @@ -147,9 +147,11 @@ cdef class WeightedMedianCalculator: cdef SIZE_t size(self) nogil cdef int push(self, DOUBLE_t data, DOUBLE_t weight) nogil cdef int update_median_parameters_post_push(self, DOUBLE_t data, - DOUBLE_t weight) nogil + DOUBLE_t weight, + double original_median) nogil cdef int remove(self, DOUBLE_t data, DOUBLE_t weight) nogil cdef int pop(self, DOUBLE_t* data, DOUBLE_t* weight) nogil cdef int update_median_parameters_post_remove(self, DOUBLE_t data, - DOUBLE_t weight) nogil + DOUBLE_t weight, + double original_median) nogil cdef double get_median(self) nogil diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index 398ef897d1e76..0a98718ab9d20 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -511,16 +511,20 @@ cdef class WeightedMedianCalculator: in the median calculation. 
""" cdef int return_value + cdef double original_median + if self.size() != 0: + original_median = self.get_median() return_value = self.samples.push(data, weight) - self.update_median_parameters_post_push(data, weight) + self.update_median_parameters_post_push(data, weight, + original_median) return return_value cdef int update_median_parameters_post_push(self, DOUBLE_t data, - DOUBLE_t weight) nogil: + DOUBLE_t weight, + double original_median) nogil: """Update the parameters used in the median calculation, namely `k` and `sum_w_0_k` after an insertion""" - cdef double current_median # trivial case of one element. if self.size() == 1: @@ -530,10 +534,9 @@ cdef class WeightedMedianCalculator: return 0 # get the original weighted median - current_median = self.get_median() self.total_weight += weight - if data < current_median: + if data < original_median: # inserting below the median, so increment k and # then update self.sum_w_0_k accordingly by adding # the weight that was added. @@ -548,10 +551,9 @@ cdef class WeightedMedianCalculator: >= self.total_weight / 2.0)): self.k -= 1 self.sum_w_0_k -= self.samples.get_weight_from_index(self.k) - return 0 - if data >= current_median: + if data >= original_median: # inserting above or at the median # minimize k such that sum(W[0:k]) >= total_weight / 2 while(self.k < self.samples.size() and @@ -564,11 +566,15 @@ cdef class WeightedMedianCalculator: """Remove a value from the MedianHeap, removing it from consideration in the median calculation """ - cdef double current_unweighted_median cdef int return_value + cdef double original_median + + if self.size() != 0: + original_median = self.get_median() return_value = self.samples.remove(data, weight) - self.update_median_parameters_post_remove(data, weight) + self.update_median_parameters_post_remove(data, weight, + original_median) return return_value cdef int pop(self, DOUBLE_t* data, DOUBLE_t* weight) nogil: @@ -576,6 +582,10 @@ cdef class WeightedMedianCalculator: left and moving to the right. 
""" cdef int return_value + cdef double original_median + + if self.size() != 0: + original_median = self.get_median() # no elements to pop if self.samples.size() == 0: @@ -583,14 +593,15 @@ cdef class WeightedMedianCalculator: return_value = self.samples.pop(data, weight) self.update_median_parameters_post_remove(data[0], - weight[0]) + weight[0], + original_median) return return_value cdef int update_median_parameters_post_remove(self, DOUBLE_t data, - DOUBLE_t weight) nogil: + DOUBLE_t weight, + double original_median) nogil: """Update the parameters used in the median calculation, namely `k` and `sum_w_0_k` after a removal""" - cdef double current_median # reset parameters because it there are no elements if self.samples.size() == 0: self.k = 0 @@ -606,10 +617,9 @@ cdef class WeightedMedianCalculator: return 0 # get the current weighted median - current_median = self.get_median() self.total_weight -= weight - if data < current_median: + if data < original_median: # removing below the median, so decrement k and # then update self.sum_w_0_k accordingly by subtracting # the removed weight @@ -627,7 +637,7 @@ cdef class WeightedMedianCalculator: self.sum_w_0_k += self.samples.get_weight_from_index(self.k-1) return 0 - if data >= current_median: + if data >= original_median: # removing above the median # minimize k such that sum(W[0:k]) >= total_weight / 2 while(self.k > 1 and ((self.sum_w_0_k - diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 0053155f8622f..78a35fe5becc1 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -1443,3 +1443,16 @@ def test_no_sparse_y_support(): # Currently we don't support sparse y for name in ALL_TREES: yield (check_no_sparse_y_support, name) + +def test_mae(): + # check MAE criterion produces correct results + # on small toy dataset + dt_mae = DecisionTreeRegressor(random_state=0, criterion="mae", + max_leaf_nodes=2) + dt_mae.fit([[3],[5],[3],[8],[5]],[6,7,3,4,3]) + assert_array_equal(dt_mae.tree_.impurity, [1.4, 1.5, 4.0/3.0]) + assert_array_equal(dt_mae.tree_.value.flat, [4, 4.5, 4.0]) + + dt_mae.fit([[3],[5],[3],[8],[5]],[6,7,3,4,3], [0.6,0.3,0.1,1.0,0.3]) + assert_array_equal(dt_mae.tree_.impurity, [7.0/2.3, 3.0/0.7, 4.0/1.6]) + assert_array_equal(dt_mae.tree_.value.flat, [4.0, 6.0, 4.0]) From 6a1f3d4b9820449d9f3bce709c0cb376d865a5cf Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Wed, 20 Jul 2016 13:50:55 -0700 Subject: [PATCH 69/75] change all medians to DOUBLE_t --- sklearn/tree/_criterion.pyx | 9 ++++++--- sklearn/tree/_utils.pxd | 6 +++--- sklearn/tree/_utils.pyx | 8 ++++---- 3 files changed, 13 insertions(+), 10 deletions(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 723810775f789..23a2c206f24ef 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -1235,7 +1235,8 @@ cdef class MAE(RegressionCriterion): y_ik = y[i * self.y_stride + k] - impurity_left[0] += fabs(( y_ik) - median) + impurity_left[0] += fabs(( y_ik) - + median) impurity_left[0] /= ((self.weighted_n_left) * self.n_outputs) for k in range(self.n_outputs): @@ -1245,8 +1246,10 @@ cdef class MAE(RegressionCriterion): y_ik = y[i * self.y_stride + k] - impurity_right[0] += fabs(( y_ik) - median) - impurity_right[0] /= ((self.weighted_n_right) * self.n_outputs) + impurity_right[0] += fabs(( y_ik) - + median) + impurity_right[0] /= ((self.weighted_n_right) * + self.n_outputs) cdef class FriedmanMSE(MSE): diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd 
index 25c09783c73ba..883f454514008 100644 --- a/sklearn/tree/_utils.pxd +++ b/sklearn/tree/_utils.pxd @@ -148,10 +148,10 @@ cdef class WeightedMedianCalculator: cdef int push(self, DOUBLE_t data, DOUBLE_t weight) nogil cdef int update_median_parameters_post_push(self, DOUBLE_t data, DOUBLE_t weight, - double original_median) nogil + DOUBLE_t original_median) nogil cdef int remove(self, DOUBLE_t data, DOUBLE_t weight) nogil cdef int pop(self, DOUBLE_t* data, DOUBLE_t* weight) nogil cdef int update_median_parameters_post_remove(self, DOUBLE_t data, DOUBLE_t weight, - double original_median) nogil + DOUBLE_t original_median) nogil - cdef double get_median(self) nogil + cdef DOUBLE_t get_median(self) nogil diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index 0a98718ab9d20..84c562d198afa 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -511,7 +511,7 @@ cdef class WeightedMedianCalculator: in the median calculation. """ cdef int return_value - cdef double original_median + cdef DOUBLE_t original_median if self.size() != 0: original_median = self.get_median() @@ -522,7 +522,7 @@ cdef class WeightedMedianCalculator: cdef int update_median_parameters_post_push(self, DOUBLE_t data, DOUBLE_t weight, - double original_median) nogil: + DOUBLE_t original_median) nogil: """Update the parameters used in the median calculation, namely `k` and `sum_w_0_k` after an insertion""" @@ -567,7 +567,7 @@ cdef class WeightedMedianCalculator: from consideration in the median calculation """ cdef int return_value - cdef double original_median + cdef DOUBLE_t original_median if self.size() != 0: original_median = self.get_median() @@ -647,7 +647,7 @@ cdef class WeightedMedianCalculator: self.sum_w_0_k -= self.samples.get_weight_from_index(self.k) return 0 - cdef double get_median(self) nogil: + cdef DOUBLE_t get_median(self) nogil: """Return the median, taking into account sample weights.""" if self.sum_w_0_k == (self.total_weight / 2.0): From e25a52cff6964f83df946bdd5a577210851c141e Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Wed, 20 Jul 2016 16:43:34 -0700 Subject: [PATCH 70/75] add logic to allocate mediancalculators once, and reset otherwise --- sklearn/tree/_criterion.pyx | 37 +++++++++++++++++++++------------ sklearn/tree/_utils.pxd | 2 ++ sklearn/tree/_utils.pyx | 14 ++++++++++++ 3 files changed, 41 insertions(+), 12 deletions(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 23a2c206f24ef..3ad0acda77bdc 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -1018,6 +1018,15 @@ cdef class MAE(RegressionCriterion): SIZE_t end) nogil: """Initialize the criterion at node samples[start:end] and children samples[start:start] and samples[start:end].""" + cdef SIZE_t i + cdef SIZE_t p + cdef SIZE_t k + cdef DOUBLE_t y_ik + cdef DOUBLE_t w = 1.0 + cdef bint init_med_calculators + + if self.n_node_samples == 0: + init_med_calculators = 0 # Initialize fields self.y = y @@ -1030,20 +1039,25 @@ cdef class MAE(RegressionCriterion): self.weighted_n_samples = weighted_n_samples self.weighted_n_node_samples = 0.
- cdef SIZE_t i - cdef SIZE_t p - cdef SIZE_t k - cdef DOUBLE_t y_ik - cdef DOUBLE_t w = 1.0 + cdef void** left_child + cdef void** right_child - with gil: + # initialize WeightedMedianCalculators + if init_med_calculators == 0: + with gil: + for k in range(self.n_outputs): + self.left_child[k] = WeightedMedianCalculator(self.n_node_samples) + self.right_child[k] = WeightedMedianCalculator(self.n_node_samples) + # already initialized, so reset WeightedMedianCalculators + else: + left_child = self.left_child.data + right_child = self.right_child.data for k in range(self.n_outputs): - self.left_child[k] = WeightedMedianCalculator(self.n_node_samples) - self.right_child[k] = WeightedMedianCalculator(self.n_node_samples) - - cdef void** left_child = self.left_child.data - cdef void** right_child = self.right_child.data + ( left_child[k]).reset() + ( right_child[k]).reset() + left_child = self.left_child.data + right_child = self.right_child.data for p in range(start, end): i = samples[p] @@ -1058,7 +1072,6 @@ cdef class MAE(RegressionCriterion): ( right_child[k]).push(y_ik, w) self.weighted_n_node_samples += w - # calculate the node medians for k in range(self.n_outputs): self.node_medians[k] = ( right_child[k]).get_median() diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd index 883f454514008..d11880908c318 100644 --- a/sklearn/tree/_utils.pxd +++ b/sklearn/tree/_utils.pxd @@ -123,6 +123,7 @@ cdef class WeightedPQueue: cdef WeightedPQueueRecord* array_ cdef bint is_empty(self) nogil + cdef void reset(self) nogil cdef SIZE_t size(self) nogil cdef int push(self, DOUBLE_t data, DOUBLE_t weight) nogil cdef int remove(self, DOUBLE_t data, DOUBLE_t weight) nogil @@ -146,6 +147,7 @@ cdef class WeightedMedianCalculator: cdef SIZE_t size(self) nogil cdef int push(self, DOUBLE_t data, DOUBLE_t weight) nogil + cdef void reset(self) nogil cdef int update_median_parameters_post_push(self, DOUBLE_t data, DOUBLE_t weight, DOUBLE_t original_median) nogil diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index 84c562d198afa..9377cfa616e16 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -13,6 +13,7 @@ from libc.stdlib cimport free from libc.stdlib cimport malloc +from libc.stdlib cimport calloc from libc.stdlib cimport realloc from libc.math cimport log as ln @@ -337,6 +338,12 @@ cdef class WeightedPQueue: def __dealloc__(self): free(self.array_) + cdef void reset(self) nogil: + """Reset the WeightedPQueue to its state at construction""" + self.array_ptr = 0 + self.array_ = calloc(self.capacity, + sizeof(WeightedPQueueRecord)) + cdef bint is_empty(self) nogil: return self.array_ptr <= 0 @@ -505,6 +512,13 @@ cdef class WeightedMedianCalculator: WeightedMedianCalculator""" return self.samples.size() + cdef void reset(self) nogil: + """Reset the WeightedMedianCalculator to its state at construction""" + self.samples.reset() + self.total_weight = 0 + self.k = 0 + self.sum_w_0_k = 0 + cdef int push(self, DOUBLE_t data, DOUBLE_t weight) nogil: """Push a value and its associated weight to the WeightedMedianCalculator to be considered From bd0c71dc8441d9f88f943807558a51408626beca Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Wed, 20 Jul 2016 20:38:36 -0700 Subject: [PATCH 71/75] misc style fixes --- sklearn/tree/_criterion.pyx | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 3ad0acda77bdc..47736ac31221c 100644 --- a/sklearn/tree/_criterion.pyx +++ 
b/sklearn/tree/_criterion.pyx @@ -1018,9 +1018,8 @@ cdef class MAE(RegressionCriterion): SIZE_t end) nogil: """Initialize the criterion at node samples[start:end] and children samples[start:start] and samples[start:end].""" - cdef SIZE_t i - cdef SIZE_t p - cdef SIZE_t k + + cdef SIZE_t i, p, k cdef DOUBLE_t y_ik cdef DOUBLE_t w = 1.0 cdef bint init_med_calculators @@ -1082,8 +1081,7 @@ cdef class MAE(RegressionCriterion): cdef void reset(self) nogil: """Reset the criterion at pos=start.""" - cdef SIZE_t i - cdef SIZE_t k + cdef SIZE_t i, k cdef DOUBLE_t value cdef DOUBLE_t weight @@ -1094,8 +1092,8 @@ cdef class MAE(RegressionCriterion): self.weighted_n_right = self.weighted_n_node_samples self.pos = self.start - # reset the medianheaps, left should have no elements and - # right should have all elements. + # reset the WeightedMedianCalculators, left should have no + # elements and right should have all elements. for k in range(self.n_outputs): # if left has no elements, it's already reset @@ -1118,8 +1116,8 @@ cdef class MAE(RegressionCriterion): cdef void** left_child = self.left_child.data cdef void** right_child = self.right_child.data - # reverse_reset the medianheaps, right should have no elements and - # left should have all elements. + # reverse reset the WeightedMedianCalculators, right should have no + # elements and left should have all elements. for k in range(self.n_outputs): # if right has no elements, it's already reset for i in range(( right_child[k]).size()): @@ -1141,9 +1139,7 @@ cdef class MAE(RegressionCriterion): cdef DOUBLE_t* y = self.y cdef SIZE_t pos = self.pos cdef SIZE_t end = self.end - cdef SIZE_t i - cdef SIZE_t p - cdef SIZE_t k + cdef SIZE_t i, p, k cdef DOUBLE_t w = 1.0 cdef DOUBLE_t y_ik @@ -1223,6 +1219,7 @@ cdef class MAE(RegressionCriterion): left child (samples[start:pos]) and the impurity the right child (samples[pos:end]). """ + cdef DOUBLE_t* y = self.y cdef DOUBLE_t* sample_weight = self.sample_weight cdef SIZE_t* samples = self.samples From d3245ae9e842f6d0d48c1becab2f506877cc05e2 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Thu, 21 Jul 2016 08:35:50 -0700 Subject: [PATCH 72/75] modify cinit of regressioncriterion to take n_samples --- sklearn/tree/_criterion.pyx | 37 +++++++++++++++++-------------------- sklearn/tree/tree.py | 3 ++- 2 files changed, 19 insertions(+), 21 deletions(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 47736ac31221c..71b9085675eda 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -687,13 +687,16 @@ cdef class RegressionCriterion(Criterion): cdef double sq_sum_total - def __cinit__(self, SIZE_t n_outputs): + def __cinit__(self, SIZE_t n_outputs, SIZE_t n_samples): """Initialize parameters for this criterion. Parameters ---------- n_outputs: SIZE_t The number of targets to be predicted + + n_samples: SIZE_t + The total number of samples to fit on """ # Default values @@ -975,13 +978,16 @@ cdef class MAE(RegressionCriterion): cdef np.ndarray right_child cdef DOUBLE_t* node_medians - def __cinit__(self, SIZE_t n_outputs): + def __cinit__(self, SIZE_t n_outputs, SIZE_t n_samples): """Initialize parameters for this criterion. 
Parameters ---------- n_outputs: SIZE_t The number of targets to be predicted + + n_samples: SIZE_t + The total number of samples to fit on """ # Default values @@ -1012,6 +1018,10 @@ cdef class MAE(RegressionCriterion): self.left_child = np.empty(n_outputs, dtype='object') self.right_child = np.empty(n_outputs, dtype='object') + # initialize WeightedMedianCalculators + for k in range(n_outputs): + self.left_child[k] = WeightedMedianCalculator(n_samples) + self.right_child[k] = WeightedMedianCalculator(n_samples) cdef void init(self, DOUBLE_t* y, SIZE_t y_stride, DOUBLE_t* sample_weight, double weighted_n_samples, SIZE_t* samples, SIZE_t start, @@ -1022,10 +1032,6 @@ cdef class MAE(RegressionCriterion): cdef SIZE_t i, p, k cdef DOUBLE_t y_ik cdef DOUBLE_t w = 1.0 - cdef bint init_med_calculators - - if self.n_node_samples == 0: - init_med_calculators = 0 # Initialize fields self.y = y @@ -1041,22 +1047,13 @@ cdef class MAE(RegressionCriterion): cdef void** left_child cdef void** right_child - # initialize WeightedMedianCalculators - if init_med_calculators == 0: - with gil: - for k in range(self.n_outputs): - self.left_child[k] = WeightedMedianCalculator(self.n_node_samples) - self.right_child[k] = WeightedMedianCalculator(self.n_node_samples) - # already initialized, so reset WeightedMedianCalculators - else: - left_child = self.left_child.data - right_child = self.right_child.data - for k in range(self.n_outputs): - ( left_child[k]).reset() - ( right_child[k]).reset() - left_child = self.left_child.data right_child = self.right_child.data + + for k in range(self.n_outputs): + ( left_child[k]).reset() + ( right_child[k]).reset() + for p in range(start, end): i = samples[p] diff --git a/sklearn/tree/tree.py b/sklearn/tree/tree.py index deca4c7730754..f004d845279bc 100644 --- a/sklearn/tree/tree.py +++ b/sklearn/tree/tree.py @@ -338,7 +338,8 @@ def fit(self, X, y, sample_weight=None, check_input=True, criterion = CRITERIA_CLF[self.criterion](self.n_outputs_, self.n_classes_) else: - criterion = CRITERIA_REG[self.criterion](self.n_outputs_) + criterion = CRITERIA_REG[self.criterion](self.n_outputs_, + n_samples) SPLITTERS = SPARSE_SPLITTERS if issparse(X) else DENSE_SPLITTERS From dbaa57bf873ba0f680f67f802aeedd83247251f6 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Thu, 21 Jul 2016 09:25:22 -0700 Subject: [PATCH 73/75] add MAE formula and force rebuild bc. 
travis was down --- sklearn/tree/_criterion.pyx | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 71b9085675eda..7e2b6a3a80e9e 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -969,7 +969,10 @@ cdef class MSE(RegressionCriterion): impurity_right[0] /= self.n_outputs cdef class MAE(RegressionCriterion): - """Mean absolute error impurity criterion""" + """Mean absolute error impurity criterion + + MAE = (1 / n)*(\sum_i |y_i - f_i|), where y_i is the true + value and f_i is the predicted value.""" def __dealloc__(self): """Destructor.""" free(self.node_medians) From f668ab9aca168759a761195db5e7ffe03ad34609 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Thu, 21 Jul 2016 13:12:47 -0700 Subject: [PATCH 74/75] add criterion parameter to gradient boosting and add forest tests --- sklearn/ensemble/gradient_boosting.py | 31 +++++++++++++++++++++------ sklearn/ensemble/tests/test_forest.py | 4 ++-- 2 files changed, 26 insertions(+), 9 deletions(-) diff --git a/sklearn/ensemble/gradient_boosting.py b/sklearn/ensemble/gradient_boosting.py index b17d726cb122a..1b0767d419168 100644 --- a/sklearn/ensemble/gradient_boosting.py +++ b/sklearn/ensemble/gradient_boosting.py @@ -720,8 +720,8 @@ class BaseGradientBoosting(six.with_metaclass(ABCMeta, BaseEnsemble, """Abstract base class for Gradient Boosting. """ @abstractmethod - def __init__(self, loss, learning_rate, n_estimators, min_samples_split, - min_samples_leaf, min_weight_fraction_leaf, + def __init__(self, loss, learning_rate, n_estimators, criterion, + min_samples_split, min_samples_leaf, min_weight_fraction_leaf, max_depth, init, subsample, max_features, random_state, alpha=0.9, verbose=0, max_leaf_nodes=None, warm_start=False, presort='auto'): @@ -729,6 +729,7 @@ def __init__(self, loss, learning_rate, n_estimators, min_samples_split, self.n_estimators = n_estimators self.learning_rate = learning_rate self.loss = loss + self.criterion = criterion self.min_samples_split = min_samples_split self.min_samples_leaf = min_samples_leaf self.min_weight_fraction_leaf = min_weight_fraction_leaf @@ -762,7 +763,7 @@ def _fit_stage(self, i, X, y, y_pred, sample_weight, sample_mask, # induce regression tree on residuals tree = DecisionTreeRegressor( - criterion='friedman_mse', + criterion=self.criterion, splitter='best', max_depth=self.max_depth, min_samples_split=self.min_samples_split, @@ -1296,6 +1297,14 @@ class GradientBoostingClassifier(BaseGradientBoosting, ClassifierMixin): of the input variables. Ignored if ``max_leaf_nodes`` is not None. + criterion : string, optional (default="friedman_mse") + The function to measure the quality of a split. Supported criteria + are "friedman_mse" for the mean squared error with improvement + score by Friedman, "mse" for mean squared error, and "mae" for + the mean absolute error. The default value of "friedman_mse" is + generally the best as it can provide a better approximation in + some cases. 
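Once this patch is applied, the new argument reads like any other estimator parameter. A quick sketch with arbitrary toy data (the dataset and hyperparameter values here are illustrative only):

    from sklearn.ensemble import GradientBoostingRegressor

    X = [[0.0], [1.0], [2.0], [3.0], [4.0], [5.0]]
    y = [0.0, 0.9, 2.1, 2.9, 4.2, 5.1]

    # grow each regression tree in the ensemble with MAE splits instead
    # of the default "friedman_mse"
    est = GradientBoostingRegressor(criterion='mae', n_estimators=10,
                                    random_state=0)
    est.fit(X, y)
    print(est.predict([[2.5]]))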
+ min_samples_split : int, float, optional (default=2) The minimum number of samples required to split an internal node: @@ -1426,7 +1435,7 @@ class GradientBoostingClassifier(BaseGradientBoosting, ClassifierMixin): _SUPPORTED_LOSS = ('deviance', 'exponential') def __init__(self, loss='deviance', learning_rate=0.1, n_estimators=100, - subsample=1.0, min_samples_split=2, + subsample=1.0, criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0., max_depth=3, init=None, random_state=None, max_features=None, verbose=0, @@ -1435,7 +1444,7 @@ def __init__(self, loss='deviance', learning_rate=0.1, n_estimators=100, super(GradientBoostingClassifier, self).__init__( loss=loss, learning_rate=learning_rate, n_estimators=n_estimators, - min_samples_split=min_samples_split, + criterion=criterion, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf, min_weight_fraction_leaf=min_weight_fraction_leaf, max_depth=max_depth, init=init, subsample=subsample, @@ -1643,6 +1652,14 @@ class GradientBoostingRegressor(BaseGradientBoosting, RegressorMixin): of the input variables. Ignored if ``max_leaf_nodes`` is not None. + criterion : string, optional (default="friedman_mse") + The function to measure the quality of a split. Supported criteria + are "friedman_mse" for the mean squared error with improvement + score by Friedman, "mse" for mean squared error, and "mae" for + the mean absolute error. The default value of "friedman_mse" is + generally the best as it can provide a better approximation in + some cases. + min_samples_split : int, float, optional (default=2) The minimum number of samples required to split an internal node: @@ -1772,7 +1789,7 @@ class GradientBoostingRegressor(BaseGradientBoosting, RegressorMixin): _SUPPORTED_LOSS = ('ls', 'lad', 'huber', 'quantile') def __init__(self, loss='ls', learning_rate=0.1, n_estimators=100, - subsample=1.0, min_samples_split=2, + subsample=1.0, criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0., max_depth=3, init=None, random_state=None, max_features=None, alpha=0.9, verbose=0, max_leaf_nodes=None, @@ -1780,7 +1797,7 @@ def __init__(self, loss='ls', learning_rate=0.1, n_estimators=100, super(GradientBoostingRegressor, self).__init__( loss=loss, learning_rate=learning_rate, n_estimators=n_estimators, - min_samples_split=min_samples_split, + criterion=criterion, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf, min_weight_fraction_leaf=min_weight_fraction_leaf, max_depth=max_depth, init=init, subsample=subsample, diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py index ce3642c5cfe21..489ba40689d38 100644 --- a/sklearn/ensemble/tests/test_forest.py +++ b/sklearn/ensemble/tests/test_forest.py @@ -159,7 +159,7 @@ def check_boston_criterion(name, criterion): def test_boston(): - for name, criterion in product(FOREST_REGRESSORS, ("mse", )): + for name, criterion in product(FOREST_REGRESSORS, ("mse", "mae", "friedman_mse")): yield check_boston_criterion, name, criterion @@ -244,7 +244,7 @@ def test_importances(): for name, criterion in product(FOREST_CLASSIFIERS, ["gini", "entropy"]): yield check_importances, name, criterion, X, y - for name, criterion in product(FOREST_REGRESSORS, ["mse", "friedman_mse"]): + for name, criterion in product(FOREST_REGRESSORS, ["mse", "friedman_mse", "mae"]): yield check_importances, name, criterion, X, y From 04d3b8add4f407101a2add2621d55f09dc6c5e14 Mon Sep 17 00:00:00 2001 From: 
Nelson Liu Date: Thu, 21 Jul 2016 13:51:30 -0700 Subject: [PATCH 75/75] add entries to what's new --- doc/whats_new.rst | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/doc/whats_new.rst b/doc/whats_new.rst index 3a3ddf932a828..dd339bcabb6da 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -117,6 +117,14 @@ New features and Harabaz score to evaluate the resulting clustering of a set of points. By `Arnaud Fouchet`_ and `Thierry Guillemot`_. + - Added a new splitting criterion for :class:`tree.DecisionTreeRegressor`, + the mean absolute error. This criterion can also be used in + :class:`ensemble.ExtraTreesRegressor`, + :class:`ensemble.RandomForestRegressor`, and the gradient boosting + estimators. (`#6667 + `_) by `Nelson + Liu`_. + Enhancements ............ @@ -146,6 +154,11 @@ Enhancements provided as a percentage of the training samples. By `yelite`_ and `Arnaud Joly`_. + - Gradient boosting estimators accept the parameter ``criterion`` to specify + the splitting criterion used when building decision trees. (`#6667 + `_) by `Nelson + Liu`_. + - Codebase does not contain C/C++ cython generated files: they are generated during build. Distribution packages will still contain generated C/C++ files. By `Arthur Mensch`_. @@ -4280,3 +4293,5 @@ David Huard, Dave Morrill, Ed Schofield, Travis Oliphant, Pearu Peterson. .. _Sebastian Säger: https://github.com/ssaeger .. _YenChen Lin: https://github.com/yenchenlin + +.. _Nelson Liu: https://github.com/nelson-liu
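The unweighted toy case asserted in test_mae above can be reproduced by hand: the root node's value is the median of y, and its impurity is the mean absolute deviation from that median. A minimal sketch of the arithmetic:

    y = [6, 7, 3, 4, 3]
    ordered = sorted(y)                  # [3, 3, 4, 6, 7]
    median = ordered[len(ordered) // 2]  # 4 -- odd count, so the whole median
    mae = sum(abs(v - median) for v in y) / float(len(y))
    print(median, mae)                   # 4 1.4

This matches the asserted tree_.value.flat[0] == 4 and tree_.impurity[0] == 1.4 for the root node.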