From 783f433df26099ed7d0ba4178d7589bfe49df0d3 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Fri, 15 Apr 2016 00:00:02 -0700 Subject: [PATCH 01/75] feature: add initial node_value method --- sklearn/tree/_criterion.pyx | 73 +++++++++++++++++++++++++++++++++ sklearn/tree/tests/test_tree.py | 2 +- sklearn/tree/tree.py | 10 +++-- 3 files changed, 80 insertions(+), 5 deletions(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 61fde29defe8d..ecc3e2d924727 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -19,6 +19,7 @@ from libc.stdlib cimport calloc from libc.stdlib cimport free from libc.string cimport memcpy from libc.string cimport memset +from libc.math cimport fabs import numpy as np cimport numpy as np @@ -962,6 +963,78 @@ cdef class MSE(RegressionCriterion): impurity_left[0] /= self.n_outputs impurity_right[0] /= self.n_outputs +cdef class MAE(RegressionCriterion): + """Mean absolute error impurity criterion + """ + cdef void node_value(self, double* dest) nogil: + """Computes the node value of samples[start:end] into dest.""" + cdef double* sample_weight = self.sample_weight + cdef SIZE_t* samples = self.samples + + cdef DOUBLE_t* y = self.y + cdef SIZE_t start = self.start + cdef SIZE_t end = self.end + cdef SIZE_t i + cdef SIZE_t p + cdef SIZE_t k + cdef DOUBLE_t w = 1.0 + cdef DOUBLE_t y_ik + + cdef DOUBLE_t sum_weights = 0 + cdef SIZE_t median_index = 0 + cdef DOUBLE_t sum + + + cdef DOUBLE_t* y_vals + cdef DOUBLE_t* weights + for k in range(self.n_outputs): + for p in range(start,end): + i = samples[p] + + if sample_weight != NULL: + w = sample_weight[i] + + y_ik = y[i * self.y_stride + k] + y_vals[p] = y_ik + weights[p] = w + + # calculate weighted median + for p in range(start, end): + sum_weights += weights[p] + sum = sum_weights - weights[0] + + while(sum > (sum_weights/2)): + median_index +=1 + sum -= weights[median_index] + dest[k] = samples[median_index] + + cdef double node_impurity(self) nogil: + """Evaluate the impurity of the current node, i.e. the impurity of + samples[start:end]""" + # todo + pass + + cdef double proxy_impurity_improvement(self) nogil: + """Compute a proxy of the impurity reduction + This method is used to speed up the search for the best split. + It is a proxy quantity such that the split that maximizes this value + also maximizes the impurity improvement. It neglects all constant terms + of the impurity decrease for a given split. + The absolute impurity improvement is only computed by the + impurity_improvement method once the best split has been found. + """ + + # todo + pass + + cdef void children_impurity(self, double* impurity_left, + double* impurity_right) nogil: + """Evaluate the impurity in children nodes, i.e. the impurity of the + left child (samples[start:pos]) and the impurity the right child + (samples[pos:end]). 
+ """ + # todo + pass cdef class FriedmanMSE(MSE): """Mean squared error impurity criterion with improvement score by Friedman diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index e4ca2be5e452a..0053155f8622f 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -48,7 +48,7 @@ from sklearn.utils import compute_sample_weight CLF_CRITERIONS = ("gini", "entropy") -REG_CRITERIONS = ("mse", ) +REG_CRITERIONS = ("mse", "mae") CLF_TREES = { "DecisionTreeClassifier": DecisionTreeClassifier, diff --git a/sklearn/tree/tree.py b/sklearn/tree/tree.py index 81fcf7f442ac2..658b9c46b298d 100644 --- a/sklearn/tree/tree.py +++ b/sklearn/tree/tree.py @@ -56,7 +56,8 @@ DOUBLE = _tree.DOUBLE CRITERIA_CLF = {"gini": _criterion.Gini, "entropy": _criterion.Entropy} -CRITERIA_REG = {"mse": _criterion.MSE, "friedman_mse": _criterion.FriedmanMSE} +CRITERIA_REG = {"mse": _criterion.MSE, "friedman_mse": _criterion.FriedmanMSE, + "mae": _criterion_MAE} DENSE_SPLITTERS = {"best": _splitter.BestSplitter, "random": _splitter.RandomSplitter} @@ -782,9 +783,10 @@ class DecisionTreeRegressor(BaseDecisionTree, RegressorMixin): Parameters ---------- criterion : string, optional (default="mse") - The function to measure the quality of a split. The only supported - criterion is "mse" for the mean squared error, which is equal to - variance reduction as feature selection criterion. + The function to measure the quality of a split. Supported + criterions are "mse" for the mean squared error, which is + equal to variance reduction as feature selection criterion, + and "mae" for the mean absolute deviation. splitter : string, optional (default="best") The strategy used to choose the split at each node. Supported From 68ae519880b71188f8dbb788a121b6e67fabc7f0 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Sun, 17 Apr 2016 13:24:45 -0700 Subject: [PATCH 02/75] testing code for node_impurity and node_value This code runs into 'Bus Error: 10' at node_value final assignment. 
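The crash is consistent with patch 01 declaring y_vals and weights as bare pointers and then writing y_vals[p] = y_ik with no storage behind them; the hunks below attach calloc'd buffers before the writes, change the final assignment to store y_val_pointer[median_index] rather than a sample index, and fix the _criterion_MAE typo in tree.py to _criterion.MAE. As a reference while debugging, here is a minimal pure-Python sketch of the quantity node_value is meant to produce for each output, the lower weighted median (the function name and data are illustrative, not part of the patch):

    def weighted_median(values, weights):
        # smallest value at which the cumulative weight reaches half the total
        total = sum(weights)
        acc = 0.0
        for v, w in sorted(zip(values, weights)):
            acc += w
            if acc >= total / 2.0:
                return v

    print(weighted_median([1.0, 2.0, 7.0], [1.0, 1.0, 1.0]))  # -> 2.0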
--- sklearn/tree/_criterion.pyx | 60 +++++++++++++++++++++++++++---------- sklearn/tree/tree.py | 2 +- 2 files changed, 46 insertions(+), 16 deletions(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index ecc3e2d924727..8f26a1cb847d7 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -968,6 +968,8 @@ cdef class MAE(RegressionCriterion): """ cdef void node_value(self, double* dest) nogil: """Computes the node value of samples[start:end] into dest.""" + with gil: + print "entered node_value" cdef double* sample_weight = self.sample_weight cdef SIZE_t* samples = self.samples @@ -980,13 +982,16 @@ cdef class MAE(RegressionCriterion): cdef DOUBLE_t w = 1.0 cdef DOUBLE_t y_ik - cdef DOUBLE_t sum_weights = 0 + cdef DOUBLE_t sum_weights = 0.0 cdef SIZE_t median_index = 0 - cdef DOUBLE_t sum - - - cdef DOUBLE_t* y_vals - cdef DOUBLE_t* weights + cdef DOUBLE_t Sum + + y_vals = NULL + weights = NULL + y_vals = calloc(self.n_node_samples, sizeof(double)) + weights = calloc(self.n_node_samples, sizeof(double)) + cdef double* y_val_pointer = y_vals + cdef double* weight_pointer = weights for k in range(self.n_outputs): for p in range(start,end): i = samples[p] @@ -995,24 +1000,49 @@ cdef class MAE(RegressionCriterion): w = sample_weight[i] y_ik = y[i * self.y_stride + k] - y_vals[p] = y_ik - weights[p] = w + + y_val_pointer[p] = y_ik + weight_pointer[p] = w - # calculate weighted median for p in range(start, end): - sum_weights += weights[p] - sum = sum_weights - weights[0] + sum_weights += weight_pointer[p] + + Sum = sum_weights - weight_pointer[0] - while(sum > (sum_weights/2)): + while(Sum > sum_weights/2): median_index +=1 - sum -= weights[median_index] - dest[k] = samples[median_index] + Sum -= weight_pointer[median_index] + + with gil: + print "calculated weighted median:" + print y_val_pointer[median_index] + dest[k] = y_val_pointer[median_index] + with gil: + print "normally this isn't printed because of bus error: 10" cdef double node_impurity(self) nogil: """Evaluate the impurity of the current node, i.e. the impurity of samples[start:end]""" + with gil: + print "Entered node_impurity function" + cdef double* medians + cdef double impurity = 0.0 + cdef SIZE_t* samples = self.samples + cdef SIZE_t k + cdef SIZE_t p + cdef SIZE_t i + cdef DOUBLE_t y_ik + self.node_value(medians) + with gil: + print "exited node_value" + for k in range(self.n_outputs): + for p in range(self.start, self.end): + i = samples[p] + y_ik = self.y[i * self.y_stride + k] + + impurity += fabs(y_ik - medians[k]) / self.n_node_samples + return impurity / self.n_outputs # todo - pass cdef double proxy_impurity_improvement(self) nogil: """Compute a proxy of the impurity reduction diff --git a/sklearn/tree/tree.py b/sklearn/tree/tree.py index 658b9c46b298d..4f8ebf9e960ed 100644 --- a/sklearn/tree/tree.py +++ b/sklearn/tree/tree.py @@ -57,7 +57,7 @@ CRITERIA_CLF = {"gini": _criterion.Gini, "entropy": _criterion.Entropy} CRITERIA_REG = {"mse": _criterion.MSE, "friedman_mse": _criterion.FriedmanMSE, - "mae": _criterion_MAE} + "mae": _criterion.MAE} DENSE_SPLITTERS = {"best": _splitter.BestSplitter, "random": _splitter.RandomSplitter} From c7b640aac7f31c98b28f1945da7a0410ce347338 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Sun, 17 Apr 2016 20:09:27 -0700 Subject: [PATCH 03/75] fix: node_value now correctly calculating weighted median for sorted data. Still need to change the code to work with unsorted data. 
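The loop in the new node_value finds that median by walking a running tail sum of the weights: it starts from sum = sum_weights - weights[0], the weight lying strictly above index 0, and advances median_index while more than half of the total weight still sits above the current position. A worked trace of that loop in plain Python, assuming already-sorted input as noted above (made-up numbers; s stands in for the diff's sum):

    y_vals  = [1.0, 2.0, 5.0, 9.0]
    weights = [1.0, 1.0, 3.0, 1.0]

    sum_weights = sum(weights)        # 6.0
    s = sum_weights - weights[0]      # 5.0 of weight sits above index 0
    median_index = 0
    while s > sum_weights / 2:        # stop once at most half the weight remains above
        median_index += 1
        s -= weights[median_index]

    print(median_index, y_vals[median_index])  # -> 2 5.0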
--- sklearn/tree/_criterion.pyx | 54 +++++++++++++------------------------ 1 file changed, 19 insertions(+), 35 deletions(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 8f26a1cb847d7..79d65487536ca 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -968,8 +968,6 @@ cdef class MAE(RegressionCriterion): """ cdef void node_value(self, double* dest) nogil: """Computes the node value of samples[start:end] into dest.""" - with gil: - print "entered node_value" cdef double* sample_weight = self.sample_weight cdef SIZE_t* samples = self.samples @@ -982,50 +980,40 @@ cdef class MAE(RegressionCriterion): cdef DOUBLE_t w = 1.0 cdef DOUBLE_t y_ik - cdef DOUBLE_t sum_weights = 0.0 - cdef SIZE_t median_index = 0 - cdef DOUBLE_t Sum + cdef DOUBLE_t sum_weights + cdef SIZE_t median_index + cdef DOUBLE_t sum - y_vals = NULL - weights = NULL - y_vals = calloc(self.n_node_samples, sizeof(double)) - weights = calloc(self.n_node_samples, sizeof(double)) - cdef double* y_val_pointer = y_vals - cdef double* weight_pointer = weights + cdef double* y_vals = calloc(self.n_node_samples, sizeof(double)) + cdef double* weights = calloc(self.n_node_samples, sizeof(double)) for k in range(self.n_outputs): + median_index = 0 + sum_weights = 0.0 for p in range(start,end): i = samples[p] - + + y_ik = y[i * self.y_stride + k] if sample_weight != NULL: w = sample_weight[i] - y_ik = y[i * self.y_stride + k] - - y_val_pointer[p] = y_ik - weight_pointer[p] = w + y_vals[p] = y_ik + weights[p] = w for p in range(start, end): - sum_weights += weight_pointer[p] + sum_weights += weights[p] - Sum = sum_weights - weight_pointer[0] - - while(Sum > sum_weights/2): + sum = sum_weights - weights[0] + + while(sum > sum_weights/2): median_index +=1 - Sum -= weight_pointer[median_index] + sum -= weights[median_index] - with gil: - print "calculated weighted median:" - print y_val_pointer[median_index] - dest[k] = y_val_pointer[median_index] - with gil: - print "normally this isn't printed because of bus error: 10" + dest[k] = y_vals[median_index] cdef double node_impurity(self) nogil: """Evaluate the impurity of the current node, i.e. 
the impurity of samples[start:end]""" - with gil: - print "Entered node_impurity function" - cdef double* medians + cdef double* medians = calloc(self.n_outputs, sizeof(double)) cdef double impurity = 0.0 cdef SIZE_t* samples = self.samples cdef SIZE_t k @@ -1033,16 +1021,12 @@ cdef class MAE(RegressionCriterion): cdef SIZE_t i cdef DOUBLE_t y_ik self.node_value(medians) - with gil: - print "exited node_value" for k in range(self.n_outputs): for p in range(self.start, self.end): i = samples[p] y_ik = self.y[i * self.y_stride + k] - - impurity += fabs(y_ik - medians[k]) / self.n_node_samples + impurity += fabs(y_ik - medians[k]) / self.weighted_n_node_samples return impurity / self.n_outputs - # todo cdef double proxy_impurity_improvement(self) nogil: """Compute a proxy of the impurity reduction From 2fb76516f0d69c659aed3d5c5cc07692013611d1 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Mon, 18 Apr 2016 20:23:55 -0700 Subject: [PATCH 04/75] fix: node_value now correctly calculates median regardless of initial order --- sklearn/tree/_criterion.pyx | 43 +++++++++++++++++++++++++++++++++++-- 1 file changed, 41 insertions(+), 2 deletions(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 79d65487536ca..36847ed448ceb 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -986,6 +986,7 @@ cdef class MAE(RegressionCriterion): cdef double* y_vals = calloc(self.n_node_samples, sizeof(double)) cdef double* weights = calloc(self.n_node_samples, sizeof(double)) + cdef SIZE_t* sorted_indexes for k in range(self.n_outputs): median_index = 0 sum_weights = 0.0 @@ -1001,15 +1002,53 @@ cdef class MAE(RegressionCriterion): for p in range(start, end): sum_weights += weights[p] - + + self.sort_values_and_weights(y_vals, weights, 0, self.n_node_samples - 1) + sum = sum_weights - weights[0] while(sum > sum_weights/2): median_index +=1 sum -= weights[median_index] - dest[k] = y_vals[median_index] + if start-end % 2 == 0: + dest[k] = (y_vals[median_index] + y_vals[median_index + 1]) / 2 + else: + dest[k] = y_vals[median_index] + + cdef void sort_values_and_weights(self, double* y_vals, double* weights, + SIZE_t low, SIZE_t high) nogil: + """Sort an array and its weights""" + cdef SIZE_t pivot, i, j, + cdef double temp + if low < high: + pivot = low + i = low + j = high + while i < j: + while(y_vals[i] <= y_vals[pivot] and i <= high): + i += 1 + while(y_vals[j] > y_vals[pivot] and j >= low): + j -= 1 + if i < j: + temp = y_vals[i] + y_vals[i] = y_vals[j] + y_vals[j] = temp + + temp = weights[i] + weights[i] = weights[j] + weights[j] = temp + temp = y_vals[j] + y_vals[j] = y_vals[pivot] + y_vals[pivot] = temp + + temp = weights[j] + weights[j] = weights[pivot] + weights[pivot] = temp + self.sort_values_and_weights(y_vals, weights, low, j-1) + self.sort_values_and_weights(y_vals, weights, j+1, high) + cdef double node_impurity(self) nogil: """Evaluate the impurity of the current node, i.e. 
the impurity of samples[start:end]""" From a3f2f7651a55e096473c5b18ea88fc26853ee84f Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Mon, 18 Apr 2016 20:50:33 -0700 Subject: [PATCH 05/75] fix: correct bug in calculating median when taking midpoint is necessary --- sklearn/tree/_criterion.pyx | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 36847ed448ceb..6a7e1767b5017 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -999,11 +999,20 @@ cdef class MAE(RegressionCriterion): y_vals[p] = y_ik weights[p] = w + with gil: + print "p {}".format(p) + print "unsorted y val {}".format(y_vals[p]) + print "unsorted weight {}".format(weights[p]) for p in range(start, end): sum_weights += weights[p] self.sort_values_and_weights(y_vals, weights, 0, self.n_node_samples - 1) + for p in range(start, end): + with gil: + print "p {}".format(p) + print "sorted y val {}".format(y_vals[p]) + print "sorted weight {}".format(weights[p]) sum = sum_weights - weights[0] @@ -1011,10 +1020,12 @@ cdef class MAE(RegressionCriterion): median_index +=1 sum -= weights[median_index] - if start-end % 2 == 0: + if sum == sum_weights/2: dest[k] = (y_vals[median_index] + y_vals[median_index + 1]) / 2 else: dest[k] = y_vals[median_index] + with gil: + print dest[k] cdef void sort_values_and_weights(self, double* y_vals, double* weights, SIZE_t low, SIZE_t high) nogil: From c40a54b752be52024ebc0c4ff8b0080be68b42b5 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Tue, 19 Apr 2016 15:36:55 -0700 Subject: [PATCH 06/75] feature: add initial version of children_impurity --- sklearn/tree/_criterion.pyx | 77 ++++++++++++++++++++++++++++++------- 1 file changed, 64 insertions(+), 13 deletions(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 6a7e1767b5017..c8d32aaf2fc96 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -999,20 +999,11 @@ cdef class MAE(RegressionCriterion): y_vals[p] = y_ik weights[p] = w - with gil: - print "p {}".format(p) - print "unsorted y val {}".format(y_vals[p]) - print "unsorted weight {}".format(weights[p]) for p in range(start, end): sum_weights += weights[p] self.sort_values_and_weights(y_vals, weights, 0, self.n_node_samples - 1) - for p in range(start, end): - with gil: - print "p {}".format(p) - print "sorted y val {}".format(y_vals[p]) - print "sorted weight {}".format(weights[p]) sum = sum_weights - weights[0] @@ -1024,8 +1015,6 @@ cdef class MAE(RegressionCriterion): dest[k] = (y_vals[median_index] + y_vals[median_index + 1]) / 2 else: dest[k] = y_vals[median_index] - with gil: - print dest[k] cdef void sort_values_and_weights(self, double* y_vals, double* weights, SIZE_t low, SIZE_t high) nogil: @@ -1087,6 +1076,9 @@ cdef class MAE(RegressionCriterion): The absolute impurity improvement is only computed by the impurity_improvement method once the best split has been found. """ + cdef SIZE_t k + cdef double proxy_impurity_left = 0.0 + cdef double proxy_impurity_right = 0.0 # todo pass @@ -1097,8 +1089,67 @@ cdef class MAE(RegressionCriterion): left child (samples[start:pos]) and the impurity the right child (samples[pos:end]). 
""" - # todo - pass + cdef DOUBLE_t* y = self.y + cdef DOUBLE_t* sample_weight = self.sample_weight + cdef SIZE_t* samples = self.samples + + cdef SIZE_t pos = self.pos + cdef SIZE_t start = self.start + + cdef double impurity_total = self.node_impurity() + + cdef DOUBLE_t sum_weights + cdef SIZE_t median_index + cdef DOUBLE_t sum + + cdef SIZE_t i + cdef SIZE_t p + cdef SIZE_t k + cdef DOUBLE_t w = 1.0 + cdef DOUBLE_t y_ik + + cdef double* y_vals = calloc(self.n_node_samples, sizeof(double)) + cdef double* weights = calloc(self.n_node_samples, sizeof(double)) + cdef double* medians = calloc(self.n_outputs, sizeof(double)) + + for k in range(self.n_outputs): + median_index = 0 + sum_weights = 0.0 + for p in range(start, pos): + i = samples[p] + y_ik = y[i * self.y_stride + k] + + if sample_weight != NULL: + w = sample_weight[i] + + y_vals[p] = y_ik + weights[p] = w + + for p in range(start, pos): + sum_weights += weights[p] + + self.sort_values_and_weights(y_vals, weights, 0, self.n_node_samples - 1) + sum = sum_weights - weights[0] + + while(sum > sum_weights/2): + median_index +=1 + sum -= weights[median_index] + + if sum == sum_weights/2: + medians[k] = (y_vals[median_index] + y_vals[median_index + 1]) / 2 + else: + medians[k] = y_vals[median_index] + + for k in range(self.n_outputs): + for p in range(start, pos): + i = samples[p] + y_ik = y[i * self.y_stride + k] + impurity_left[0] += fabs(y_ik - medians[k]) / (pos - start) + + impurity_right[0] = impurity_total - impurity_left[0] + + impurity_left[0] /= self.n_outputs + impurity_right[0] /= self.n_outputs cdef class FriedmanMSE(MSE): """Mean squared error impurity criterion with improvement score by Friedman From 19e811dcab439a0254512d516ecbad2581fe0b7d Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Tue, 19 Apr 2016 16:33:03 -0700 Subject: [PATCH 07/75] feature: refactor median calculation into one function --- sklearn/tree/_criterion.pyx | 75 +++++++++++-------------------------- 1 file changed, 22 insertions(+), 53 deletions(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index c8d32aaf2fc96..f95c41ccf71a7 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -967,26 +967,30 @@ cdef class MAE(RegressionCriterion): """Mean absolute error impurity criterion """ cdef void node_value(self, double* dest) nogil: - """Computes the node value of samples[start:end] into dest.""" - cdef double* sample_weight = self.sample_weight - cdef SIZE_t* samples = self.samples - - cdef DOUBLE_t* y = self.y + """Computes the node value of samples[start:end] into dest.""" cdef SIZE_t start = self.start cdef SIZE_t end = self.end - cdef SIZE_t i - cdef SIZE_t p - cdef SIZE_t k + + cdef double* y_vals = calloc(self.n_node_samples, sizeof(double)) + cdef double* weights = calloc(self.n_node_samples, sizeof(double)) + self.compute_weighted_median(dest, y_vals, weights, start, end) + + cdef void compute_weighted_median(self, double* median_dest, double* y_vals, + double* weights, SIZE_t start, SIZE_t end) nogil: + """Calculate the weighted median and put it into a destination pointer + given values, weights, and a start and end index + """ + cdef double* sample_weight = self.sample_weight + cdef DOUBLE_t* y = self.y + cdef SIZE_t* samples = self.samples cdef DOUBLE_t w = 1.0 cdef DOUBLE_t y_ik + cdef SIZE_t i, p, k cdef DOUBLE_t sum_weights cdef SIZE_t median_index cdef DOUBLE_t sum - cdef double* y_vals = calloc(self.n_node_samples, sizeof(double)) - cdef double* weights = calloc(self.n_node_samples, 
sizeof(double)) - cdef SIZE_t* sorted_indexes for k in range(self.n_outputs): median_index = 0 sum_weights = 0.0 @@ -999,7 +1003,7 @@ cdef class MAE(RegressionCriterion): y_vals[p] = y_ik weights[p] = w - + for p in range(start, end): sum_weights += weights[p] @@ -1012,9 +1016,10 @@ cdef class MAE(RegressionCriterion): sum -= weights[median_index] if sum == sum_weights/2: - dest[k] = (y_vals[median_index] + y_vals[median_index + 1]) / 2 + median_dest[k] = (y_vals[median_index] + y_vals[median_index + 1]) / 2 else: - dest[k] = y_vals[median_index] + median_dest[k] = y_vals[median_index] + cdef void sort_values_and_weights(self, double* y_vals, double* weights, SIZE_t low, SIZE_t high) nogil: @@ -1076,9 +1081,6 @@ cdef class MAE(RegressionCriterion): The absolute impurity improvement is only computed by the impurity_improvement method once the best split has been found. """ - cdef SIZE_t k - cdef double proxy_impurity_left = 0.0 - cdef double proxy_impurity_right = 0.0 # todo pass @@ -1098,48 +1100,15 @@ cdef class MAE(RegressionCriterion): cdef double impurity_total = self.node_impurity() - cdef DOUBLE_t sum_weights - cdef SIZE_t median_index - cdef DOUBLE_t sum - - cdef SIZE_t i - cdef SIZE_t p - cdef SIZE_t k - cdef DOUBLE_t w = 1.0 + cdef SIZE_t i, p, k cdef DOUBLE_t y_ik cdef double* y_vals = calloc(self.n_node_samples, sizeof(double)) cdef double* weights = calloc(self.n_node_samples, sizeof(double)) cdef double* medians = calloc(self.n_outputs, sizeof(double)) + self.compute_weighted_median(medians, y_vals, weights, start, pos) + - for k in range(self.n_outputs): - median_index = 0 - sum_weights = 0.0 - for p in range(start, pos): - i = samples[p] - y_ik = y[i * self.y_stride + k] - - if sample_weight != NULL: - w = sample_weight[i] - - y_vals[p] = y_ik - weights[p] = w - - for p in range(start, pos): - sum_weights += weights[p] - - self.sort_values_and_weights(y_vals, weights, 0, self.n_node_samples - 1) - sum = sum_weights - weights[0] - - while(sum > sum_weights/2): - median_index +=1 - sum -= weights[median_index] - - if sum == sum_weights/2: - medians[k] = (y_vals[median_index] + y_vals[median_index + 1]) / 2 - else: - medians[k] = y_vals[median_index] - for k in range(self.n_outputs): for p in range(start, pos): i = samples[p] From 31f04b40f2418a87146e64da9503b07735398817 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Thu, 21 Apr 2016 15:08:27 -0700 Subject: [PATCH 08/75] fix: fix use of DOUBLE_t vs double --- sklearn/tree/_criterion.pyx | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index f95c41ccf71a7..e038c38173b79 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -971,12 +971,12 @@ cdef class MAE(RegressionCriterion): cdef SIZE_t start = self.start cdef SIZE_t end = self.end - cdef double* y_vals = calloc(self.n_node_samples, sizeof(double)) - cdef double* weights = calloc(self.n_node_samples, sizeof(double)) + cdef double* y_vals = calloc(self.n_node_samples, sizeof(DOUBLE_t)) + cdef double* weights = calloc(self.n_node_samples, sizeof(DOUBLE_t)) self.compute_weighted_median(dest, y_vals, weights, start, end) - cdef void compute_weighted_median(self, double* median_dest, double* y_vals, - double* weights, SIZE_t start, SIZE_t end) nogil: + cdef void compute_weighted_median(self, double* median_dest, DOUBLE_t* y_vals, + DOUBLE_t* weights, SIZE_t start, SIZE_t end) nogil: """Calculate the weighted median and put it into a destination pointer given values, 
weights, and a start and end index """ @@ -1021,7 +1021,7 @@ cdef class MAE(RegressionCriterion): median_dest[k] = y_vals[median_index] - cdef void sort_values_and_weights(self, double* y_vals, double* weights, + cdef void sort_values_and_weights(self, DOUBLE_t* y_vals, DOUBLE_t* weights, SIZE_t low, SIZE_t high) nogil: """Sort an array and its weights""" cdef SIZE_t pivot, i, j, @@ -1052,7 +1052,6 @@ cdef class MAE(RegressionCriterion): weights[pivot] = temp self.sort_values_and_weights(y_vals, weights, low, j-1) self.sort_values_and_weights(y_vals, weights, j+1, high) - cdef double node_impurity(self) nogil: """Evaluate the impurity of the current node, i.e. the impurity of @@ -1103,8 +1102,8 @@ cdef class MAE(RegressionCriterion): cdef SIZE_t i, p, k cdef DOUBLE_t y_ik - cdef double* y_vals = calloc(self.n_node_samples, sizeof(double)) - cdef double* weights = calloc(self.n_node_samples, sizeof(double)) + cdef DOUBLE_t* y_vals = calloc(self.n_node_samples, sizeof(DOUBLE_t)) + cdef DOUBLE_t* weights = calloc(self.n_node_samples, sizeof(DOUBLE_t)) cdef double* medians = calloc(self.n_outputs, sizeof(double)) self.compute_weighted_median(medians, y_vals, weights, start, pos) From ffff6166468a1a43bf228e7b2276edb734b067e9 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Wed, 11 May 2016 23:49:31 -0700 Subject: [PATCH 09/75] feature: move helper functions to _utils.pyx, fix mismatched pointer type --- sklearn/tree/_criterion.pyx | 119 ++++++++---------------------------- sklearn/tree/_utils.pxd | 7 +++ sklearn/tree/_utils.pyx | 84 +++++++++++++++++++++++++ 3 files changed, 115 insertions(+), 95 deletions(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index e038c38173b79..20d1eb3d7c6ef 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -28,6 +28,7 @@ np.import_array() from ._utils cimport log from ._utils cimport safe_realloc from ._utils cimport sizet_ptr_to_ndarray +from ._utils cimport compute_weighted_median cdef class Criterion: """Interface for impurity criteria. 
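Both helpers now live in _utils.pyx as free nogil functions. The moved sort_values_and_weights is a recursive quicksort that performs every swap on both arrays in lockstep, so each weight stays attached to its value. A compact Python equivalent of that invariant, illustrative only:

    def sort_values_and_weights(y_vals, weights):
        # sort by value and carry the parallel weights array along,
        # as the Cython helper does with pairwise swaps
        order = sorted(range(len(y_vals)), key=y_vals.__getitem__)
        return [y_vals[i] for i in order], [weights[i] for i in order]

    vals, wts = sort_values_and_weights([5.0, 1.0, 3.0], [0.2, 0.5, 0.3])
    print(vals, wts)  # -> [1.0, 3.0, 5.0] [0.5, 0.3, 0.2]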
@@ -848,7 +849,7 @@ cdef class RegressionCriterion(Criterion): self.weighted_n_left -= w - self.weighted_n_right = (self.weighted_n_node_samples - + self.weighted_n_right = (self.weighted_n_node_samples - self.weighted_n_left) for k in range(self.n_outputs): sum_right[k] = sum_total[k] - sum_left[k] @@ -922,7 +923,6 @@ cdef class MSE(RegressionCriterion): left child (samples[start:pos]) and the impurity the right child (samples[pos:end]).""" - cdef DOUBLE_t* y = self.y cdef DOUBLE_t* sample_weight = self.sample_weight cdef SIZE_t* samples = self.samples @@ -958,7 +958,7 @@ cdef class MSE(RegressionCriterion): for k in range(self.n_outputs): impurity_left[0] -= (sum_left[k] / self.weighted_n_left) ** 2.0 - impurity_right[0] -= (sum_right[k] / self.weighted_n_right) ** 2.0 + impurity_right[0] -= (sum_right[k] / self.weighted_n_right) ** 2.0 impurity_left[0] /= self.n_outputs impurity_right[0] /= self.n_outputs @@ -967,101 +967,26 @@ cdef class MAE(RegressionCriterion): """Mean absolute error impurity criterion """ cdef void node_value(self, double* dest) nogil: - """Computes the node value of samples[start:end] into dest.""" + """Computes the node value of samples[start:end] into dest.""" cdef SIZE_t start = self.start cdef SIZE_t end = self.end - cdef double* y_vals = calloc(self.n_node_samples, sizeof(DOUBLE_t)) - cdef double* weights = calloc(self.n_node_samples, sizeof(DOUBLE_t)) - self.compute_weighted_median(dest, y_vals, weights, start, end) - - cdef void compute_weighted_median(self, double* median_dest, DOUBLE_t* y_vals, - DOUBLE_t* weights, SIZE_t start, SIZE_t end) nogil: - """Calculate the weighted median and put it into a destination pointer - given values, weights, and a start and end index - """ - cdef double* sample_weight = self.sample_weight - cdef DOUBLE_t* y = self.y - cdef SIZE_t* samples = self.samples - cdef DOUBLE_t w = 1.0 - cdef DOUBLE_t y_ik - - cdef SIZE_t i, p, k - cdef DOUBLE_t sum_weights - cdef SIZE_t median_index - cdef DOUBLE_t sum - - for k in range(self.n_outputs): - median_index = 0 - sum_weights = 0.0 - for p in range(start,end): - i = samples[p] - - y_ik = y[i * self.y_stride + k] - if sample_weight != NULL: - w = sample_weight[i] + cdef DOUBLE_t* y_vals = calloc(self.n_node_samples, + sizeof(DOUBLE_t)) + cdef DOUBLE_t* weights = calloc(self.n_node_samples, + sizeof(DOUBLE_t)) + compute_weighted_median(dest, y_vals, weights, start, end, + self.sample_weight, self.y, self.samples, + self.y_stride, self.n_node_samples, + self.n_outputs) - y_vals[p] = y_ik - weights[p] = w - - for p in range(start, end): - sum_weights += weights[p] - - self.sort_values_and_weights(y_vals, weights, 0, self.n_node_samples - 1) - - sum = sum_weights - weights[0] - - while(sum > sum_weights/2): - median_index +=1 - sum -= weights[median_index] - - if sum == sum_weights/2: - median_dest[k] = (y_vals[median_index] + y_vals[median_index + 1]) / 2 - else: - median_dest[k] = y_vals[median_index] - - - cdef void sort_values_and_weights(self, DOUBLE_t* y_vals, DOUBLE_t* weights, - SIZE_t low, SIZE_t high) nogil: - """Sort an array and its weights""" - cdef SIZE_t pivot, i, j, - cdef double temp - if low < high: - pivot = low - i = low - j = high - while i < j: - while(y_vals[i] <= y_vals[pivot] and i <= high): - i += 1 - while(y_vals[j] > y_vals[pivot] and j >= low): - j -= 1 - if i < j: - temp = y_vals[i] - y_vals[i] = y_vals[j] - y_vals[j] = temp - - temp = weights[i] - weights[i] = weights[j] - weights[j] = temp - temp = y_vals[j] - y_vals[j] = y_vals[pivot] - y_vals[pivot] = temp 
- - temp = weights[j] - weights[j] = weights[pivot] - weights[pivot] = temp - self.sort_values_and_weights(y_vals, weights, low, j-1) - self.sort_values_and_weights(y_vals, weights, j+1, high) - cdef double node_impurity(self) nogil: - """Evaluate the impurity of the current node, i.e. the impurity of + """Evaluate the impurity of the current node, i.e. the impurity of samples[start:end]""" cdef double* medians = calloc(self.n_outputs, sizeof(double)) cdef double impurity = 0.0 cdef SIZE_t* samples = self.samples - cdef SIZE_t k - cdef SIZE_t p - cdef SIZE_t i + cdef SIZE_t i, p, k cdef DOUBLE_t y_ik self.node_value(medians) for k in range(self.n_outputs): @@ -1102,12 +1027,16 @@ cdef class MAE(RegressionCriterion): cdef SIZE_t i, p, k cdef DOUBLE_t y_ik - cdef DOUBLE_t* y_vals = calloc(self.n_node_samples, sizeof(DOUBLE_t)) - cdef DOUBLE_t* weights = calloc(self.n_node_samples, sizeof(DOUBLE_t)) + cdef DOUBLE_t* y_vals = calloc(self.n_node_samples, + sizeof(DOUBLE_t)) + cdef DOUBLE_t* weights = calloc(self.n_node_samples, + sizeof(DOUBLE_t)) cdef double* medians = calloc(self.n_outputs, sizeof(double)) - self.compute_weighted_median(medians, y_vals, weights, start, pos) - - + compute_weighted_median(medians, y_vals, weights, start, pos, + self.sample_weight, self.y, self.samples, + self.y_stride, self.n_node_samples, + self.n_outputs) + for k in range(self.n_outputs): for p in range(start, pos): i = samples[p] @@ -1175,5 +1104,5 @@ cdef class FriedmanMSE(MSE): diff = (self.weighted_n_right * total_sum_left - self.weighted_n_left * total_sum_right) / self.n_outputs - return (diff * diff / (self.weighted_n_left * self.weighted_n_right * + return (diff * diff / (self.weighted_n_left * self.weighted_n_right * self.weighted_n_node_samples)) diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd index 9537bbb91cf27..69e023ce83961 100644 --- a/sklearn/tree/_utils.pxd +++ b/sklearn/tree/_utils.pxd @@ -39,6 +39,13 @@ cdef realloc_ptr safe_realloc(realloc_ptr* p, size_t nelems) except * cdef np.ndarray sizet_ptr_to_ndarray(SIZE_t* data, SIZE_t size) +cdef void compute_weighted_median(double* median_dest, DOUBLE_t* y_vals, + DOUBLE_t* weights, SIZE_t start, SIZE_t end, + DOUBLE_t* sample_weight, DOUBLE_t* y, + SIZE_t* samples, SIZE_t y_stride, + SIZE_t n_node_samples, + SIZE_t n_outputs) nogil + cdef SIZE_t rand_int(SIZE_t low, SIZE_t high, UINT32_t* random_state) nogil diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index 6a3833128b5fa..5f508521c2069 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -49,6 +49,90 @@ def _realloc_test(): assert False +cdef void compute_weighted_median(double* median_dest, DOUBLE_t* y_vals, + DOUBLE_t* weights, SIZE_t start, SIZE_t end, + DOUBLE_t* sample_weight, DOUBLE_t* y, + SIZE_t* samples, SIZE_t y_stride, + SIZE_t n_node_samples, + SIZE_t n_outputs) nogil: + """Calculate the weighted median and put it into a destination pointer + given values, weights, and a start and end index + """ + # cdef DOUBLE_t* sample_weight = self.sample_weight + # cdef DOUBLE_t* y = self.y + # cdef SIZE_t* samples = self.samples + cdef DOUBLE_t w = 1.0 + cdef DOUBLE_t y_ik + + cdef SIZE_t i, p, k + cdef DOUBLE_t sum_weights + cdef SIZE_t median_index + cdef DOUBLE_t sum + + for k in range(n_outputs): + median_index = 0 + sum_weights = 0.0 + for p in range(start,end): + i = samples[p] + + y_ik = y[i * y_stride + k] + if sample_weight != NULL: + w = sample_weight[i] + + y_vals[p] = y_ik + weights[p] = w + + for p in range(start, end): + 
sum_weights += weights[p] + + # self.sort_values_and_weights(y_vals, weights, 0, self.n_node_samples - 1) + sort_values_and_weights(y_vals, weights, 0, n_node_samples - 1) + + sum = sum_weights - weights[0] + + while(sum > sum_weights/2): + median_index +=1 + sum -= weights[median_index] + + if sum == sum_weights/2: + median_dest[k] = (y_vals[median_index] + y_vals[median_index + 1]) / 2 + else: + median_dest[k] = y_vals[median_index] + + +cdef void sort_values_and_weights(DOUBLE_t* y_vals, DOUBLE_t* weights, + SIZE_t low, SIZE_t high) nogil: + """Sort an array and its weights""" + cdef SIZE_t pivot, i, j, + cdef double temp + if low < high: + pivot = low + i = low + j = high + while i < j: + while(y_vals[i] <= y_vals[pivot] and i <= high): + i += 1 + while(y_vals[j] > y_vals[pivot] and j >= low): + j -= 1 + if i < j: + temp = y_vals[i] + y_vals[i] = y_vals[j] + y_vals[j] = temp + + temp = weights[i] + weights[i] = weights[j] + weights[j] = temp + temp = y_vals[j] + y_vals[j] = y_vals[pivot] + y_vals[pivot] = temp + + temp = weights[j] + weights[j] = weights[pivot] + weights[pivot] = temp + sort_values_and_weights(y_vals, weights, low, j-1) + sort_values_and_weights(y_vals, weights, j+1, high) + + # rand_r replacement using a 32bit XorShift generator # See http://www.jstatsoft.org/v08/i14/paper for details cdef inline UINT32_t our_rand_r(UINT32_t* seed) nogil: From bfde38d4fdcceb5c61d49dc915233b4feef21236 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Sat, 14 May 2016 16:58:38 -0700 Subject: [PATCH 10/75] fix: fix some bugs in children_impurity method --- sklearn/tree/_criterion.pxd | 2 +- sklearn/tree/_criterion.pyx | 21 +++++++++++++-------- sklearn/tree/_utils.pxd | 1 - sklearn/tree/_utils.pyx | 9 ++++----- 4 files changed, 18 insertions(+), 15 deletions(-) diff --git a/sklearn/tree/_criterion.pxd b/sklearn/tree/_criterion.pxd index 889a623d732b3..172d57659e6a6 100644 --- a/sklearn/tree/_criterion.pxd +++ b/sklearn/tree/_criterion.pxd @@ -44,7 +44,7 @@ cdef class Criterion: # weighted count of each label. For regression, # the sum of w*y. sum_total[k] is equal to # sum_{i=start}^{end-1} w[samples[i]]*y[samples[i], k], - # where k is output index. + # where k is output index. cdef double* sum_left # Same as above, but for the left side of the split cdef double* sum_right # same as above, but for the right side of the split diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 20d1eb3d7c6ef..3c5d61dc3ffed 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -977,8 +977,7 @@ cdef class MAE(RegressionCriterion): sizeof(DOUBLE_t)) compute_weighted_median(dest, y_vals, weights, start, end, self.sample_weight, self.y, self.samples, - self.y_stride, self.n_node_samples, - self.n_outputs) + self.y_stride, self.n_outputs) cdef double node_impurity(self) nogil: """Evaluate the impurity of the current node, i.e. 
the impurity of @@ -994,6 +993,8 @@ cdef class MAE(RegressionCriterion): i = samples[p] y_ik = self.y[i * self.y_stride + k] impurity += fabs(y_ik - medians[k]) / self.weighted_n_node_samples + with gil: + print "impurity / self.n_outputs = {} / {} = {}".format(impurity, self.n_outputs, impurity / self.n_outputs) return impurity / self.n_outputs cdef double proxy_impurity_improvement(self) nogil: @@ -1019,8 +1020,9 @@ cdef class MAE(RegressionCriterion): cdef DOUBLE_t* sample_weight = self.sample_weight cdef SIZE_t* samples = self.samples - cdef SIZE_t pos = self.pos cdef SIZE_t start = self.start + cdef SIZE_t pos = self.pos + cdef SIZE_t end = self.end cdef double impurity_total = self.node_impurity() @@ -1031,11 +1033,10 @@ cdef class MAE(RegressionCriterion): sizeof(DOUBLE_t)) cdef DOUBLE_t* weights = calloc(self.n_node_samples, sizeof(DOUBLE_t)) - cdef double* medians = calloc(self.n_outputs, sizeof(double)) + cdef double* medians = calloc(self.n_outputs, sizeof(double)) compute_weighted_median(medians, y_vals, weights, start, pos, self.sample_weight, self.y, self.samples, - self.y_stride, self.n_node_samples, - self.n_outputs) + self.y_stride, self.n_outputs) for k in range(self.n_outputs): for p in range(start, pos): @@ -1044,9 +1045,13 @@ cdef class MAE(RegressionCriterion): impurity_left[0] += fabs(y_ik - medians[k]) / (pos - start) impurity_right[0] = impurity_total - impurity_left[0] + with gil: + print "start: {}".format(start) + print "pos: {}".format(pos) + print "end: {}".format(end) + print "impurity_left[0]: {}".format(impurity_left[0]) + print "impurity_right[0]: {}".format(impurity_right[0]) - impurity_left[0] /= self.n_outputs - impurity_right[0] /= self.n_outputs cdef class FriedmanMSE(MSE): """Mean squared error impurity criterion with improvement score by Friedman diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd index 69e023ce83961..cafecca9d124f 100644 --- a/sklearn/tree/_utils.pxd +++ b/sklearn/tree/_utils.pxd @@ -43,7 +43,6 @@ cdef void compute_weighted_median(double* median_dest, DOUBLE_t* y_vals, DOUBLE_t* weights, SIZE_t start, SIZE_t end, DOUBLE_t* sample_weight, DOUBLE_t* y, SIZE_t* samples, SIZE_t y_stride, - SIZE_t n_node_samples, SIZE_t n_outputs) nogil diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index 5f508521c2069..72a4e1c828b2e 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -53,16 +53,17 @@ cdef void compute_weighted_median(double* median_dest, DOUBLE_t* y_vals, DOUBLE_t* weights, SIZE_t start, SIZE_t end, DOUBLE_t* sample_weight, DOUBLE_t* y, SIZE_t* samples, SIZE_t y_stride, - SIZE_t n_node_samples, SIZE_t n_outputs) nogil: - """Calculate the weighted median and put it into a destination pointer - given values, weights, and a start and end index + """Calculate the weighted median of samples[start:end] and put + it into a destination pointer + given values, weights, and a start and end index. 
""" # cdef DOUBLE_t* sample_weight = self.sample_weight # cdef DOUBLE_t* y = self.y # cdef SIZE_t* samples = self.samples cdef DOUBLE_t w = 1.0 cdef DOUBLE_t y_ik + cdef SIZE_t n_node_samples = end-start cdef SIZE_t i, p, k cdef DOUBLE_t sum_weights @@ -85,9 +86,7 @@ cdef void compute_weighted_median(double* median_dest, DOUBLE_t* y_vals, for p in range(start, end): sum_weights += weights[p] - # self.sort_values_and_weights(y_vals, weights, 0, self.n_node_samples - 1) sort_values_and_weights(y_vals, weights, 0, n_node_samples - 1) - sum = sum_weights - weights[0] while(sum > sum_weights/2): From 8b77de01bf4bb916cc111fc1e075f528f351a6e6 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Wed, 18 May 2016 19:18:13 -0700 Subject: [PATCH 11/75] push a debug version to try to solve segfault --- sklearn/tree/_criterion.pyx | 84 +++++++++++++++++++++++-------------- sklearn/tree/_splitter.pyx | 6 ++- sklearn/tree/_tree.pyx | 6 ++- sklearn/tree/_utils.pyx | 16 +++---- sklearn/tree/tree.py | 2 +- 5 files changed, 71 insertions(+), 43 deletions(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 3c5d61dc3ffed..0a0186fdd6390 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -39,10 +39,11 @@ cdef class Criterion: def __dealloc__(self): """Destructor.""" - + print "entered criterion dealloc" free(self.sum_total) free(self.sum_left) free(self.sum_right) + print "exited criterion dealloc" def __getstate__(self): return {} @@ -170,6 +171,7 @@ cdef class Criterion: return (- self.weighted_n_right * impurity_right - self.weighted_n_left * impurity_left) + cdef double impurity_improvement(self, double impurity) nogil: """Placeholder for improvement in impurity after a split. @@ -200,9 +202,9 @@ cdef class Criterion: self.children_impurity(&impurity_left, &impurity_right) return ((self.weighted_n_node_samples / self.weighted_n_samples) * - (impurity - (self.weighted_n_right / + (impurity - (self.weighted_n_right / self.weighted_n_node_samples * impurity_right) - - (self.weighted_n_left / + - (self.weighted_n_left / self.weighted_n_node_samples * impurity_left))) @@ -265,14 +267,14 @@ cdef class ClassificationCriterion(Criterion): self.sum_left = calloc(n_elements, sizeof(double)) self.sum_right = calloc(n_elements, sizeof(double)) - if (self.sum_total == NULL or + if (self.sum_total == NULL or self.sum_left == NULL or self.sum_right == NULL): raise MemoryError() def __dealloc__(self): """Destructor.""" - + print "entered classificationcriterion dealloc" free(self.n_classes) def __reduce__(self): @@ -724,7 +726,8 @@ cdef class RegressionCriterion(Criterion): self.sum_left = calloc(n_outputs, sizeof(double)) self.sum_right = calloc(n_outputs, sizeof(double)) - if (self.sum_total == NULL or + + if (self.sum_total == NULL or self.sum_left == NULL or self.sum_right == NULL): raise MemoryError() @@ -970,11 +973,15 @@ cdef class MAE(RegressionCriterion): """Computes the node value of samples[start:end] into dest.""" cdef SIZE_t start = self.start cdef SIZE_t end = self.end - - cdef DOUBLE_t* y_vals = calloc(self.n_node_samples, - sizeof(DOUBLE_t)) - cdef DOUBLE_t* weights = calloc(self.n_node_samples, - sizeof(DOUBLE_t)) + cdef double* y_vals = NULL + cdef double* weights = NULL + y_vals = calloc(self.n_node_samples, + sizeof(double)) + weights = calloc(self.n_node_samples, + sizeof(double)) + if (y_vals == NULL or weights == NULL): + with gil: + raise MemoryError() compute_weighted_median(dest, y_vals, weights, start, end, self.sample_weight, self.y, self.samples, 
self.y_stride, self.n_outputs) @@ -982,7 +989,13 @@ cdef class MAE(RegressionCriterion): cdef double node_impurity(self) nogil: """Evaluate the impurity of the current node, i.e. the impurity of samples[start:end]""" - cdef double* medians = calloc(self.n_outputs, sizeof(double)) + with gil: + print "entered node_impurity" + cdef double* medians = NULL + medians = calloc(self.n_outputs, sizeof(double)) + if (medians == NULL): + with gil: + raise MemoryError() cdef double impurity = 0.0 cdef SIZE_t* samples = self.samples cdef SIZE_t i, p, k @@ -992,23 +1005,23 @@ cdef class MAE(RegressionCriterion): for p in range(self.start, self.end): i = samples[p] y_ik = self.y[i * self.y_stride + k] - impurity += fabs(y_ik - medians[k]) / self.weighted_n_node_samples + impurity += (fabs(y_ik - medians[k]) / self.weighted_n_node_samples) with gil: print "impurity / self.n_outputs = {} / {} = {}".format(impurity, self.n_outputs, impurity / self.n_outputs) return impurity / self.n_outputs - cdef double proxy_impurity_improvement(self) nogil: - """Compute a proxy of the impurity reduction - This method is used to speed up the search for the best split. - It is a proxy quantity such that the split that maximizes this value - also maximizes the impurity improvement. It neglects all constant terms - of the impurity decrease for a given split. - The absolute impurity improvement is only computed by the - impurity_improvement method once the best split has been found. - """ + # cdef double proxy_impurity_improvement(self) nogil: + # """Compute a proxy of the impurity reduction + # This method is used to speed up the search for the best split. + # It is a proxy quantity such that the split that maximizes this value + # also maximizes the impurity improvement. It neglects all constant terms + # of the impurity decrease for a given split. + # The absolute impurity improvement is only computed by the + # impurity_improvement method once the best split has been found. 
+ # """ - # todo - pass + # # todo + # pass cdef void children_impurity(self, double* impurity_left, double* impurity_right) nogil: @@ -1029,11 +1042,18 @@ cdef class MAE(RegressionCriterion): cdef SIZE_t i, p, k cdef DOUBLE_t y_ik - cdef DOUBLE_t* y_vals = calloc(self.n_node_samples, - sizeof(DOUBLE_t)) - cdef DOUBLE_t* weights = calloc(self.n_node_samples, - sizeof(DOUBLE_t)) - cdef double* medians = calloc(self.n_outputs, sizeof(double)) + cdef double* y_vals = NULL + cdef double* weights = NULL + cdef double* medians = NULL + + y_vals = calloc(self.n_node_samples, + sizeof(double)) + weights = calloc(self.n_node_samples, + sizeof(double)) + medians = calloc(self.n_outputs, sizeof(double)) + if (y_vals == NULL or weights == NULL or medians == NULL): + with gil: + raise MemoryError() compute_weighted_median(medians, y_vals, weights, start, pos, self.sample_weight, self.y, self.samples, self.y_stride, self.n_outputs) @@ -1046,9 +1066,9 @@ cdef class MAE(RegressionCriterion): impurity_right[0] = impurity_total - impurity_left[0] with gil: - print "start: {}".format(start) - print "pos: {}".format(pos) - print "end: {}".format(end) + # print "start: {}".format(start) + # print "pos: {}".format(pos) + # print "end: {}".format(end) print "impurity_left[0]: {}".format(impurity_left[0]) print "impurity_right[0]: {}".format(impurity_right[0]) diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 0617508aab236..c2ea129a86f25 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -104,11 +104,12 @@ cdef class Splitter: def __dealloc__(self): """Destructor.""" - + print "entered splitter dealloc" free(self.samples) free(self.features) free(self.constant_features) free(self.feature_values) + print "exited splitter dealloc" def __getstate__(self): return {} @@ -253,9 +254,11 @@ cdef class BaseDenseSplitter(Splitter): self.presort = presort def __dealloc__(self): + print "entered basedensesplitter dealloc" """Destructor.""" if self.presort == 1: free(self.sample_mask) + print "exited basedensesplitter dealloc" cdef void init(self, object X, @@ -861,6 +864,7 @@ cdef class BaseSparseSplitter(Splitter): self.sorted_samples = NULL def __dealloc__(self): + print "entered basesparsesplitter dealloc" """Deallocate memory.""" free(self.index_to_samples) free(self.sorted_samples) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index f44320a7b47ae..d0f384589a626 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -264,7 +264,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): tree.max_depth = max_depth_seen if rc == -1: raise MemoryError() - + print "dont building" # Best first builder ---------------------------------------------------------- @@ -603,6 +603,7 @@ cdef class Tree: def __dealloc__(self): """Destructor.""" + print "entered tree dealloc" # Free all inner structures free(self.n_classes) free(self.value) @@ -798,6 +799,7 @@ cdef class Tree: cdef inline np.ndarray _apply_sparse_csr(self, object X): """Finds the terminal region (=leaf node) for each sample in sparse X. 
""" + print "entered _apply_sparse_csr in tree.pyx" # Check input if not isinstance(X, csr_matrix): raise ValueError("X should be in csr_matrix format, got %s" @@ -939,7 +941,7 @@ cdef class Tree: cdef inline object _decision_path_sparse_csr(self, object X): """Finds the decision path (=node) for each sample in X.""" - + print "entered _decision_path_sparse_csr in tree.pyx" # Check input if not isinstance(X, csr_matrix): raise ValueError("X should be in csr_matrix format, got %s" diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index 72a4e1c828b2e..4c70df1159af5 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -45,12 +45,13 @@ def _realloc_test(): cdef SIZE_t* p = NULL safe_realloc(&p, (-1) / 2) if p != NULL: + print "entered free in dealloc test" free(p) assert False -cdef void compute_weighted_median(double* median_dest, DOUBLE_t* y_vals, - DOUBLE_t* weights, SIZE_t start, SIZE_t end, +cdef void compute_weighted_median(double* median_dest, double* y_vals, + double* weights, SIZE_t start, SIZE_t end, DOUBLE_t* sample_weight, DOUBLE_t* y, SIZE_t* samples, SIZE_t y_stride, SIZE_t n_outputs) nogil: @@ -58,12 +59,9 @@ cdef void compute_weighted_median(double* median_dest, DOUBLE_t* y_vals, it into a destination pointer given values, weights, and a start and end index. """ - # cdef DOUBLE_t* sample_weight = self.sample_weight - # cdef DOUBLE_t* y = self.y - # cdef SIZE_t* samples = self.samples cdef DOUBLE_t w = 1.0 cdef DOUBLE_t y_ik - cdef SIZE_t n_node_samples = end-start + cdef SIZE_t n_node_samples = end - start cdef SIZE_t i, p, k cdef DOUBLE_t sum_weights @@ -99,7 +97,8 @@ cdef void compute_weighted_median(double* median_dest, DOUBLE_t* y_vals, median_dest[k] = y_vals[median_index] -cdef void sort_values_and_weights(DOUBLE_t* y_vals, DOUBLE_t* weights, + +cdef void sort_values_and_weights(double* y_vals, double* weights, SIZE_t low, SIZE_t high) nogil: """Sort an array and its weights""" cdef SIZE_t pivot, i, j, @@ -194,7 +193,9 @@ cdef class Stack: raise MemoryError() def __dealloc__(self): + print "entered dealloc in stack" free(self.stack_) + print "exited dealloc in stack" cdef bint is_empty(self) nogil: return self.top <= 0 @@ -316,6 +317,7 @@ cdef class PriorityHeap: raise MemoryError() def __dealloc__(self): + print "entered dealloc in priorityheap" free(self.heap_) cdef bint is_empty(self) nogil: diff --git a/sklearn/tree/tree.py b/sklearn/tree/tree.py index 4f8ebf9e960ed..d61fe54fa1198 100644 --- a/sklearn/tree/tree.py +++ b/sklearn/tree/tree.py @@ -371,7 +371,7 @@ def fit(self, X, y, sample_weight=None, check_input=True, if self.n_outputs_ == 1: self.n_classes_ = self.n_classes_[0] self.classes_ = self.classes_[0] - + print "done fitting" return self def _validate_X_predict(self, X, check_input): From adb244d8b41f0ab25ad72224dbb46c02b28c8387 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Wed, 18 May 2016 21:48:45 -0700 Subject: [PATCH 12/75] push latest changes, segfault probably happening bc of something in _utils.pyx --- sklearn/tree/_criterion.pyx | 33 ++++++++++----------------------- sklearn/tree/_utils.pxd | 3 +-- sklearn/tree/_utils.pyx | 31 ++++++++++++++++++++++--------- 3 files changed, 33 insertions(+), 34 deletions(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 0a0186fdd6390..c31882fc7b838 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -973,18 +973,9 @@ cdef class MAE(RegressionCriterion): """Computes the node value of samples[start:end] into dest.""" cdef SIZE_t 
start = self.start cdef SIZE_t end = self.end - cdef double* y_vals = NULL - cdef double* weights = NULL - y_vals = calloc(self.n_node_samples, - sizeof(double)) - weights = calloc(self.n_node_samples, - sizeof(double)) - if (y_vals == NULL or weights == NULL): - with gil: - raise MemoryError() - compute_weighted_median(dest, y_vals, weights, start, end, - self.sample_weight, self.y, self.samples, - self.y_stride, self.n_outputs) + + compute_weighted_median(dest, start, end, self.sample_weight, self.y, + self.samples, self.y_stride, self.n_outputs) cdef double node_impurity(self) nogil: """Evaluate the impurity of the current node, i.e. the impurity of @@ -992,7 +983,7 @@ cdef class MAE(RegressionCriterion): with gil: print "entered node_impurity" cdef double* medians = NULL - medians = calloc(self.n_outputs, sizeof(double)) + medians = calloc(self.n_outputs, sizeof(double)) if (medians == NULL): with gil: raise MemoryError() @@ -1042,21 +1033,17 @@ cdef class MAE(RegressionCriterion): cdef SIZE_t i, p, k cdef DOUBLE_t y_ik - cdef double* y_vals = NULL - cdef double* weights = NULL cdef double* medians = NULL - y_vals = calloc(self.n_node_samples, - sizeof(double)) - weights = calloc(self.n_node_samples, - sizeof(double)) medians = calloc(self.n_outputs, sizeof(double)) - if (y_vals == NULL or weights == NULL or medians == NULL): + if (medians == NULL): with gil: raise MemoryError() - compute_weighted_median(medians, y_vals, weights, start, pos, - self.sample_weight, self.y, self.samples, - self.y_stride, self.n_outputs) + for k in range(self.n_outputs): + medians[k] = k + compute_weighted_median(medians, start, pos, self.sample_weight, + self.y, self.samples, self.y_stride, + self.n_outputs) for k in range(self.n_outputs): for p in range(start, pos): diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd index cafecca9d124f..0678675ab1175 100644 --- a/sklearn/tree/_utils.pxd +++ b/sklearn/tree/_utils.pxd @@ -39,8 +39,7 @@ cdef realloc_ptr safe_realloc(realloc_ptr* p, size_t nelems) except * cdef np.ndarray sizet_ptr_to_ndarray(SIZE_t* data, SIZE_t size) -cdef void compute_weighted_median(double* median_dest, DOUBLE_t* y_vals, - DOUBLE_t* weights, SIZE_t start, SIZE_t end, +cdef void compute_weighted_median(double* median_dest, SIZE_t start, SIZE_t end, DOUBLE_t* sample_weight, DOUBLE_t* y, SIZE_t* samples, SIZE_t y_stride, SIZE_t n_outputs) nogil diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index 4c70df1159af5..5afbb1051924a 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -12,9 +12,11 @@ from libc.stdlib cimport free from libc.stdlib cimport malloc +from libc.stdlib cimport calloc from libc.stdlib cimport realloc from libc.math cimport log as ln + import numpy as np cimport numpy as np np.import_array() @@ -50,8 +52,7 @@ def _realloc_test(): assert False -cdef void compute_weighted_median(double* median_dest, double* y_vals, - double* weights, SIZE_t start, SIZE_t end, +cdef void compute_weighted_median(double* median_dest, SIZE_t start, SIZE_t end, DOUBLE_t* sample_weight, DOUBLE_t* y, SIZE_t* samples, SIZE_t y_stride, SIZE_t n_outputs) nogil: @@ -68,6 +69,18 @@ cdef void compute_weighted_median(double* median_dest, double* y_vals, cdef SIZE_t median_index cdef DOUBLE_t sum + cdef DOUBLE_t* y_vals = NULL + cdef DOUBLE_t* weights = NULL + y_vals = calloc(n_node_samples, + sizeof(DOUBLE_t)) + weights = calloc(n_node_samples, + sizeof(DOUBLE_t)) + + if (y_vals == NULL or weights == NULL): + with gil: + raise MemoryError() + + for k in 
range(n_outputs): median_index = 0 sum_weights = 0.0 @@ -78,17 +91,17 @@ cdef void compute_weighted_median(double* median_dest, double* y_vals, if sample_weight != NULL: w = sample_weight[i] - y_vals[p] = y_ik weights[p] = w - + y_vals[p] = y_ik + sort_values_and_weights(y_vals, weights, 0, + n_node_samples - 1) for p in range(start, end): sum_weights += weights[p] - sort_values_and_weights(y_vals, weights, 0, n_node_samples - 1) sum = sum_weights - weights[0] while(sum > sum_weights/2): - median_index +=1 + median_index += 1 sum -= weights[median_index] if sum == sum_weights/2: @@ -97,12 +110,11 @@ cdef void compute_weighted_median(double* median_dest, double* y_vals, median_dest[k] = y_vals[median_index] - -cdef void sort_values_and_weights(double* y_vals, double* weights, +cdef void sort_values_and_weights(DOUBLE_t* y_vals, DOUBLE_t* weights, SIZE_t low, SIZE_t high) nogil: """Sort an array and its weights""" cdef SIZE_t pivot, i, j, - cdef double temp + cdef DOUBLE_t temp if low < high: pivot = low i = low @@ -127,6 +139,7 @@ cdef void sort_values_and_weights(double* y_vals, double* weights, temp = weights[j] weights[j] = weights[pivot] weights[pivot] = temp + sort_values_and_weights(y_vals, weights, low, j-1) sort_values_and_weights(y_vals, weights, j+1, high) From ca5149aecef4440ecb699347ee679cea9da6bee4 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Thu, 19 May 2016 01:12:23 -0700 Subject: [PATCH 13/75] fix: fix segfault in median calculation and remove excessive logging --- sklearn/tree/_criterion.pyx | 13 ------------- sklearn/tree/_splitter.pyx | 5 ----- sklearn/tree/_tree.pyx | 4 ---- sklearn/tree/_utils.pyx | 20 ++++++++------------ sklearn/tree/tree.py | 1 - 5 files changed, 8 insertions(+), 35 deletions(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index c31882fc7b838..7b2b33cba5768 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -39,11 +39,9 @@ cdef class Criterion: def __dealloc__(self): """Destructor.""" - print "entered criterion dealloc" free(self.sum_total) free(self.sum_left) free(self.sum_right) - print "exited criterion dealloc" def __getstate__(self): return {} @@ -274,7 +272,6 @@ cdef class ClassificationCriterion(Criterion): def __dealloc__(self): """Destructor.""" - print "entered classificationcriterion dealloc" free(self.n_classes) def __reduce__(self): @@ -980,8 +977,6 @@ cdef class MAE(RegressionCriterion): cdef double node_impurity(self) nogil: """Evaluate the impurity of the current node, i.e. 
the impurity of samples[start:end]""" - with gil: - print "entered node_impurity" cdef double* medians = NULL medians = calloc(self.n_outputs, sizeof(double)) if (medians == NULL): @@ -997,8 +992,6 @@ cdef class MAE(RegressionCriterion): i = samples[p] y_ik = self.y[i * self.y_stride + k] impurity += (fabs(y_ik - medians[k]) / self.weighted_n_node_samples) - with gil: - print "impurity / self.n_outputs = {} / {} = {}".format(impurity, self.n_outputs, impurity / self.n_outputs) return impurity / self.n_outputs # cdef double proxy_impurity_improvement(self) nogil: @@ -1052,12 +1045,6 @@ cdef class MAE(RegressionCriterion): impurity_left[0] += fabs(y_ik - medians[k]) / (pos - start) impurity_right[0] = impurity_total - impurity_left[0] - with gil: - # print "start: {}".format(start) - # print "pos: {}".format(pos) - # print "end: {}".format(end) - print "impurity_left[0]: {}".format(impurity_left[0]) - print "impurity_right[0]: {}".format(impurity_right[0]) cdef class FriedmanMSE(MSE): diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index c2ea129a86f25..8d8bf1f985bf9 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -104,12 +104,10 @@ cdef class Splitter: def __dealloc__(self): """Destructor.""" - print "entered splitter dealloc" free(self.samples) free(self.features) free(self.constant_features) free(self.feature_values) - print "exited splitter dealloc" def __getstate__(self): return {} @@ -254,11 +252,9 @@ cdef class BaseDenseSplitter(Splitter): self.presort = presort def __dealloc__(self): - print "entered basedensesplitter dealloc" """Destructor.""" if self.presort == 1: free(self.sample_mask) - print "exited basedensesplitter dealloc" cdef void init(self, object X, @@ -864,7 +860,6 @@ cdef class BaseSparseSplitter(Splitter): self.sorted_samples = NULL def __dealloc__(self): - print "entered basesparsesplitter dealloc" """Deallocate memory.""" free(self.index_to_samples) free(self.sorted_samples) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index d0f384589a626..1a3c5877e75ba 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -264,7 +264,6 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): tree.max_depth = max_depth_seen if rc == -1: raise MemoryError() - print "dont building" # Best first builder ---------------------------------------------------------- @@ -603,7 +602,6 @@ cdef class Tree: def __dealloc__(self): """Destructor.""" - print "entered tree dealloc" # Free all inner structures free(self.n_classes) free(self.value) @@ -799,7 +797,6 @@ cdef class Tree: cdef inline np.ndarray _apply_sparse_csr(self, object X): """Finds the terminal region (=leaf node) for each sample in sparse X. 
""" - print "entered _apply_sparse_csr in tree.pyx" # Check input if not isinstance(X, csr_matrix): raise ValueError("X should be in csr_matrix format, got %s" @@ -941,7 +938,6 @@ cdef class Tree: cdef inline object _decision_path_sparse_csr(self, object X): """Finds the decision path (=node) for each sample in X.""" - print "entered _decision_path_sparse_csr in tree.pyx" # Check input if not isinstance(X, csr_matrix): raise ValueError("X should be in csr_matrix format, got %s" diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index 5afbb1051924a..020812773630c 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -47,7 +47,6 @@ def _realloc_test(): cdef SIZE_t* p = NULL safe_realloc(&p, (-1) / 2) if p != NULL: - print "entered free in dealloc test" free(p) assert False @@ -67,7 +66,7 @@ cdef void compute_weighted_median(double* median_dest, SIZE_t start, SIZE_t end, cdef SIZE_t i, p, k cdef DOUBLE_t sum_weights cdef SIZE_t median_index - cdef DOUBLE_t sum + cdef DOUBLE_t running_sum cdef DOUBLE_t* y_vals = NULL cdef DOUBLE_t* weights = NULL @@ -80,11 +79,11 @@ cdef void compute_weighted_median(double* median_dest, SIZE_t start, SIZE_t end, with gil: raise MemoryError() - for k in range(n_outputs): median_index = 0 + # median_index = start sum_weights = 0.0 - for p in range(start,end): + for p in range(0, n_node_samples): i = samples[p] y_ik = y[i * y_stride + k] @@ -95,16 +94,16 @@ cdef void compute_weighted_median(double* median_dest, SIZE_t start, SIZE_t end, y_vals[p] = y_ik sort_values_and_weights(y_vals, weights, 0, n_node_samples - 1) - for p in range(start, end): + for p in range(0, n_node_samples): sum_weights += weights[p] - sum = sum_weights - weights[0] + running_sum = sum_weights - weights[0] - while(sum > sum_weights/2): + while(running_sum > sum_weights/2): median_index += 1 - sum -= weights[median_index] + running_sum -= weights[median_index] - if sum == sum_weights/2: + if running_sum == sum_weights/2: median_dest[k] = (y_vals[median_index] + y_vals[median_index + 1]) / 2 else: median_dest[k] = y_vals[median_index] @@ -206,9 +205,7 @@ cdef class Stack: raise MemoryError() def __dealloc__(self): - print "entered dealloc in stack" free(self.stack_) - print "exited dealloc in stack" cdef bint is_empty(self) nogil: return self.top <= 0 @@ -330,7 +327,6 @@ cdef class PriorityHeap: raise MemoryError() def __dealloc__(self): - print "entered dealloc in priorityheap" free(self.heap_) cdef bint is_empty(self) nogil: diff --git a/sklearn/tree/tree.py b/sklearn/tree/tree.py index d61fe54fa1198..a1e3f0cfecdbc 100644 --- a/sklearn/tree/tree.py +++ b/sklearn/tree/tree.py @@ -371,7 +371,6 @@ def fit(self, X, y, sample_weight=None, check_input=True, if self.n_outputs_ == 1: self.n_classes_ = self.n_classes_[0] self.classes_ = self.classes_[0] - print "done fitting" return self def _validate_X_predict(self, X, check_input): From 1e5a969e9f0545765d2f2c4b6c29cab1a4744d09 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Thu, 19 May 2016 01:20:07 -0700 Subject: [PATCH 14/75] chore: revert some misc spacing changes I accidentally made --- sklearn/tree/_criterion.pyx | 4 ++-- sklearn/tree/_tree.pyx | 2 ++ sklearn/tree/tree.py | 1 + 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 7b2b33cba5768..96af724d0680e 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -39,6 +39,7 @@ cdef class Criterion: def __dealloc__(self): """Destructor.""" + free(self.sum_total) 
free(self.sum_left) free(self.sum_right) @@ -169,7 +170,6 @@ cdef class Criterion: return (- self.weighted_n_right * impurity_right - self.weighted_n_left * impurity_left) - cdef double impurity_improvement(self, double impurity) nogil: """Placeholder for improvement in impurity after a split. @@ -272,6 +272,7 @@ cdef class ClassificationCriterion(Criterion): def __dealloc__(self): """Destructor.""" + free(self.n_classes) def __reduce__(self): @@ -723,7 +724,6 @@ cdef class RegressionCriterion(Criterion): self.sum_left = calloc(n_outputs, sizeof(double)) self.sum_right = calloc(n_outputs, sizeof(double)) - if (self.sum_total == NULL or self.sum_left == NULL or self.sum_right == NULL): diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 1a3c5877e75ba..f44320a7b47ae 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -265,6 +265,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): if rc == -1: raise MemoryError() + # Best first builder ---------------------------------------------------------- cdef inline int _add_to_frontier(PriorityHeapRecord* rec, @@ -938,6 +939,7 @@ cdef class Tree: cdef inline object _decision_path_sparse_csr(self, object X): """Finds the decision path (=node) for each sample in X.""" + # Check input if not isinstance(X, csr_matrix): raise ValueError("X should be in csr_matrix format, got %s" diff --git a/sklearn/tree/tree.py b/sklearn/tree/tree.py index a1e3f0cfecdbc..4f8ebf9e960ed 100644 --- a/sklearn/tree/tree.py +++ b/sklearn/tree/tree.py @@ -371,6 +371,7 @@ def fit(self, X, y, sample_weight=None, check_input=True, if self.n_outputs_ == 1: self.n_classes_ = self.n_classes_[0] self.classes_ = self.classes_[0] + return self def _validate_X_predict(self, X, check_input): From 99132ace58900c34a7347e12e732edec2db65ab0 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Thu, 19 May 2016 01:21:34 -0700 Subject: [PATCH 15/75] chore: one last spacing fix in _splitter.pyx --- sklearn/tree/_splitter.pyx | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 8d8bf1f985bf9..0617508aab236 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -104,6 +104,7 @@ cdef class Splitter: def __dealloc__(self): """Destructor.""" + free(self.samples) free(self.features) free(self.constant_features) From 9655fb071bdd6b4d93643748a2e1f73a527ef18b Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Thu, 19 May 2016 14:10:44 -0700 Subject: [PATCH 16/75] feature: don't calculate weighted median if no weights are passed in --- sklearn/tree/_utils.pyx | 54 ++++++++++++++++++++++++----------------- 1 file changed, 32 insertions(+), 22 deletions(-) diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index 020812773630c..c49584894b055 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -72,16 +72,18 @@ cdef void compute_weighted_median(double* median_dest, SIZE_t start, SIZE_t end, cdef DOUBLE_t* weights = NULL y_vals = calloc(n_node_samples, sizeof(DOUBLE_t)) - weights = calloc(n_node_samples, - sizeof(DOUBLE_t)) + if sample_weight != NULL: + with gil: + print "made weights arry" + weights = calloc(n_node_samples, + sizeof(DOUBLE_t)) - if (y_vals == NULL or weights == NULL): + if (y_vals == NULL or (weights == NULL and sample_weight != NULL)): with gil: raise MemoryError() for k in range(n_outputs): median_index = 0 - # median_index = start sum_weights = 0.0 for p in range(0, n_node_samples): i = samples[p] @@ -89,25 +91,32 @@ cdef void compute_weighted_median(double* 
median_dest, SIZE_t start, SIZE_t end, y_ik = y[i * y_stride + k] if sample_weight != NULL: w = sample_weight[i] + weights[p] = w - weights[p] = w y_vals[p] = y_ik sort_values_and_weights(y_vals, weights, 0, n_node_samples - 1) - for p in range(0, n_node_samples): - sum_weights += weights[p] + if sample_weight != NULL: + # calculate the weighted median + for p in range(0, n_node_samples): + sum_weights += weights[p] - running_sum = sum_weights - weights[0] + running_sum = sum_weights - weights[0] - while(running_sum > sum_weights/2): - median_index += 1 - running_sum -= weights[median_index] + while(running_sum > sum_weights/2): + median_index += 1 + running_sum -= weights[median_index] - if running_sum == sum_weights/2: - median_dest[k] = (y_vals[median_index] + y_vals[median_index + 1]) / 2 + if running_sum == sum_weights/2: + median_dest[k] = (y_vals[median_index] + y_vals[median_index + 1]) / 2 + else: + median_dest[k] = y_vals[median_index] else: - median_dest[k] = y_vals[median_index] - + # calculate the unweighted median + if n_node_samples % 2 == 0: + median_dest[k] = (y_vals[n_node_samples / 2] + y_vals[(n_node_samples / 2) - 1])/2 + else: + median_dest[k] = y_vals[n_node_samples / 2] cdef void sort_values_and_weights(DOUBLE_t* y_vals, DOUBLE_t* weights, SIZE_t low, SIZE_t high) nogil: @@ -127,17 +136,18 @@ cdef void sort_values_and_weights(DOUBLE_t* y_vals, DOUBLE_t* weights, temp = y_vals[i] y_vals[i] = y_vals[j] y_vals[j] = temp - - temp = weights[i] - weights[i] = weights[j] - weights[j] = temp + if weights != NULL: + temp = weights[i] + weights[i] = weights[j] + weights[j] = temp temp = y_vals[j] y_vals[j] = y_vals[pivot] y_vals[pivot] = temp - temp = weights[j] - weights[j] = weights[pivot] - weights[pivot] = temp + if weights != NULL: + temp = weights[j] + weights[j] = weights[pivot] + weights[pivot] = temp sort_values_and_weights(y_vals, weights, low, j-1) sort_values_and_weights(y_vals, weights, j+1, high) From e0476b97b439dfb5684fded4636b4a36c5852ea1 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Thu, 19 May 2016 17:30:07 -0700 Subject: [PATCH 17/75] remove extraneous logging statement --- sklearn/tree/_utils.pyx | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index c49584894b055..6f747967c6c49 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -72,9 +72,8 @@ cdef void compute_weighted_median(double* median_dest, SIZE_t start, SIZE_t end, cdef DOUBLE_t* weights = NULL y_vals = calloc(n_node_samples, sizeof(DOUBLE_t)) + if sample_weight != NULL: - with gil: - print "made weights arry" weights = calloc(n_node_samples, sizeof(DOUBLE_t)) @@ -85,6 +84,7 @@ cdef void compute_weighted_median(double* median_dest, SIZE_t start, SIZE_t end, for k in range(n_outputs): median_index = 0 sum_weights = 0.0 + for p in range(0, n_node_samples): i = samples[p] @@ -94,6 +94,7 @@ cdef void compute_weighted_median(double* median_dest, SIZE_t start, SIZE_t end, weights[p] = w y_vals[p] = y_ik + sort_values_and_weights(y_vals, weights, 0, n_node_samples - 1) if sample_weight != NULL: @@ -118,6 +119,7 @@ cdef void compute_weighted_median(double* median_dest, SIZE_t start, SIZE_t end, else: median_dest[k] = y_vals[n_node_samples / 2] + cdef void sort_values_and_weights(DOUBLE_t* y_vals, DOUBLE_t* weights, SIZE_t low, SIZE_t high) nogil: """Sort an array and its weights""" From 04dfc7ee66c2e3aad9ec13990af57e3c1c5fa748 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Thu, 19 May 2016 20:34:33 -0700 
Subject: [PATCH 18/75] fix: fix children impurity calculation

---
 sklearn/tree/_criterion.pyx | 20 ++++++++++++--------
 sklearn/tree/_utils.pyx     |  2 +-
 2 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx
index 96af724d0680e..e42a08006af26 100644
--- a/sklearn/tree/_criterion.pyx
+++ b/sklearn/tree/_criterion.pyx
@@ -888,7 +888,6 @@ cdef class MSE(RegressionCriterion):
         impurity = self.sq_sum_total / self.weighted_n_node_samples
         for k in range(self.n_outputs):
             impurity -= (sum_total[k] / self.weighted_n_node_samples)**2.0
-
         return impurity / self.n_outputs

     cdef double proxy_impurity_improvement(self) nogil:
@@ -970,7 +969,6 @@ cdef class MAE(RegressionCriterion):
         """Computes the node value of samples[start:end] into dest."""
         cdef SIZE_t start = self.start
         cdef SIZE_t end = self.end
-
         compute_weighted_median(dest, start, end, self.sample_weight, self.y,
                                 self.samples, self.y_stride, self.n_outputs)

@@ -1021,8 +1019,6 @@ cdef class MAE(RegressionCriterion):
         cdef SIZE_t pos = self.pos
         cdef SIZE_t end = self.end

-        cdef double impurity_total = self.node_impurity()
-
         cdef SIZE_t i, p, k
         cdef DOUBLE_t y_ik

@@ -1032,8 +1028,7 @@ cdef class MAE(RegressionCriterion):
         if (medians == NULL):
             with gil:
                 raise MemoryError()
-        for k in range(self.n_outputs):
-            medians[k] = k
+
         compute_weighted_median(medians, start, pos, self.sample_weight,
                                 self.y, self.samples, self.y_stride,
                                 self.n_outputs)
@@ -1042,9 +1037,18 @@ cdef class MAE(RegressionCriterion):
             for p in range(start, pos):
                 i = samples[p]
                 y_ik = y[i * self.y_stride + k]
-                impurity_left[0] += fabs(y_ik - medians[k]) / (pos - start)
+                impurity_left[0] += (fabs(y_ik - medians[k]) / (pos - start))
+        impurity_left[0] /= self.n_outputs

-        impurity_right[0] = impurity_total - impurity_left[0]
+        compute_weighted_median(medians, pos, end, self.sample_weight,
+                                self.y, self.samples, self.y_stride,
+                                self.n_outputs)
+        for k in range(self.n_outputs):
+            for p in range(pos, end):
+                i = samples[p]
+                y_ik = y[i * self.y_stride + k]
+                impurity_right[0] += (fabs(y_ik - medians[k]) / (end - pos))
+        impurity_right[0] /= self.n_outputs


 cdef class FriedmanMSE(MSE):
diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx
index 6f747967c6c49..78998b36f39e3 100644
--- a/sklearn/tree/_utils.pyx
+++ b/sklearn/tree/_utils.pyx
@@ -86,7 +86,7 @@ cdef void compute_weighted_median(double* median_dest, SIZE_t start, SIZE_t end,
         sum_weights = 0.0

         for p in range(0, n_node_samples):
-            i = samples[p]
+            i = samples[p + start]

             y_ik = y[i * y_stride + k]
             if sample_weight != NULL:

From a61782f80ca61517fb5cbea5d7efb2385f8f0716 Mon Sep 17 00:00:00 2001
From: Nelson Liu
Date: Thu, 19 May 2016 23:46:25 -0700
Subject: [PATCH 19/75] fix: fix bug with children impurity not being
 initially set to 0

---
 sklearn/tree/_criterion.pyx | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx
index e42a08006af26..9e2f648c23f0e 100644
--- a/sklearn/tree/_criterion.pyx
+++ b/sklearn/tree/_criterion.pyx
@@ -990,6 +990,8 @@ cdef class MAE(RegressionCriterion):
             i = samples[p]
             y_ik = self.y[i * self.y_stride + k]
             impurity += (fabs(y_ik - medians[k]) / self.weighted_n_node_samples)
+        with gil:
+            print impurity / self.n_outputs
         return impurity / self.n_outputs

     # cdef double proxy_impurity_improvement(self) nogil:
@@ -1021,6 +1023,7 @@ cdef class MAE(RegressionCriterion):

         cdef SIZE_t i, p, k
         cdef DOUBLE_t y_ik
+        cdef double test

         cdef double* medians = NULL

@@
-1032,12 +1035,17 @@ cdef class MAE(RegressionCriterion): compute_weighted_median(medians, start, pos, self.sample_weight, self.y, self.samples, self.y_stride, self.n_outputs) - + impurity_left[0] = 0.0 + impurity_right[0] = 0.0 for k in range(self.n_outputs): for p in range(start, pos): i = samples[p] y_ik = y[i * self.y_stride + k] - impurity_left[0] += (fabs(y_ik - medians[k]) / (pos - start)) + test = (fabs(y_ik - medians[k]) / (pos-start)) + # with gil: + # print test + # impurity_left[0] += (fabs(y_ik - medians[k]) / (pos - start)) + impurity_left[0] += test impurity_left[0] /= self.n_outputs compute_weighted_median(medians, pos, end, self.sample_weight, From 33af0fbbbaa0793c5f6e4ca051753065a1b702c7 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Sat, 21 May 2016 13:38:41 -0700 Subject: [PATCH 20/75] fix: hacky fix for a float accuracy error --- sklearn/tree/_criterion.pyx | 23 ++++++++++------------- sklearn/tree/_splitter.pyx | 8 +++++++- 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 9e2f648c23f0e..15177bcd7be95 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -166,7 +166,6 @@ cdef class Criterion: cdef double impurity_left cdef double impurity_right self.children_impurity(&impurity_left, &impurity_right) - return (- self.weighted_n_right * impurity_right - self.weighted_n_left * impurity_left) @@ -198,7 +197,6 @@ cdef class Criterion: cdef double impurity_right self.children_impurity(&impurity_left, &impurity_right) - return ((self.weighted_n_node_samples / self.weighted_n_samples) * (impurity - (self.weighted_n_right / self.weighted_n_node_samples * impurity_right) @@ -821,7 +819,6 @@ cdef class RegressionCriterion(Criterion): # and that sum_total is known, we are going to update # sum_left from the direction that require the least amount # of computations, i.e. from pos to new_pos or from end to new_po. - if (new_pos - pos) <= (end - new_pos): for p in range(pos, new_pos): i = samples[p] @@ -971,6 +968,7 @@ cdef class MAE(RegressionCriterion): cdef SIZE_t end = self.end compute_weighted_median(dest, start, end, self.sample_weight, self.y, self.samples, self.y_stride, self.n_outputs) + cdef SIZE_t i cdef double node_impurity(self) nogil: """Evaluate the impurity of the current node, i.e. 
the impurity of @@ -990,8 +988,6 @@ cdef class MAE(RegressionCriterion): i = samples[p] y_ik = self.y[i * self.y_stride + k] impurity += (fabs(y_ik - medians[k]) / self.weighted_n_node_samples) - with gil: - print impurity / self.n_outputs return impurity / self.n_outputs # cdef double proxy_impurity_improvement(self) nogil: @@ -1023,7 +1019,6 @@ cdef class MAE(RegressionCriterion): cdef SIZE_t i, p, k cdef DOUBLE_t y_ik - cdef double test cdef double* medians = NULL @@ -1037,16 +1032,16 @@ cdef class MAE(RegressionCriterion): self.n_outputs) impurity_left[0] = 0.0 impurity_right[0] = 0.0 + for k in range(self.n_outputs): for p in range(start, pos): i = samples[p] y_ik = y[i * self.y_stride + k] - test = (fabs(y_ik - medians[k]) / (pos-start)) - # with gil: - # print test # impurity_left[0] += (fabs(y_ik - medians[k]) / (pos - start)) - impurity_left[0] += test - impurity_left[0] /= self.n_outputs + impurity_left[0] += fabs(y_ik - medians[k]) + # impurity_left[0] /= self.n_outputs + impurity_left[0] /= ((pos - start) * self.n_outputs) + compute_weighted_median(medians, pos, end, self.sample_weight, self.y, self.samples, self.y_stride, @@ -1055,8 +1050,10 @@ cdef class MAE(RegressionCriterion): for p in range(pos, end): i = samples[p] y_ik = y[i * self.y_stride + k] - impurity_right[0] += (fabs(y_ik - medians[k]) / (end - pos)) - impurity_right[0] /= self.n_outputs + # impurity_right[0] += (fabs(y_ik - medians[k]) / (end - pos)) + impurity_right[0] += fabs(y_ik - medians[k]) + # impurity_right[0] /= self.n_outputs + impurity_right[0] /= ((end - pos) * self.n_outputs) cdef class FriedmanMSE(MSE): diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 0617508aab236..31848546d8732 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -458,7 +458,6 @@ cdef class BestSplitter(BaseDenseSplitter): continue current_proxy_improvement = self.criterion.proxy_impurity_improvement() - if current_proxy_improvement > best_proxy_improvement: best_proxy_improvement = current_proxy_improvement current.threshold = (Xf[p - 1] + Xf[p]) / 2.0 @@ -486,6 +485,7 @@ cdef class BestSplitter(BaseDenseSplitter): samples[p] = tmp self.criterion.reset() + self.criterion.update(best.pos) best.improvement = self.criterion.impurity_improvement(impurity) self.criterion.children_impurity(&best.impurity_left, @@ -782,6 +782,7 @@ cdef class RandomSplitter(BaseDenseSplitter): # Evaluate split self.criterion.reset() + self.criterion.update(current.pos) # Reject if min_weight_leaf is not satisfied @@ -815,6 +816,7 @@ cdef class RandomSplitter(BaseDenseSplitter): self.criterion.reset() + self.criterion.update(best.pos) best.improvement = self.criterion.impurity_improvement(impurity) self.criterion.children_impurity(&best.impurity_left, @@ -1354,6 +1356,7 @@ cdef class BestSparseSplitter(BaseSparseSplitter): best = current # Reorganize into samples[start:best.pos] + samples[best.pos:end] + if best.pos < end: self.extract_nnz(best.feature, &end_negative, &start_positive, &is_samples_sorted) @@ -1362,6 +1365,7 @@ cdef class BestSparseSplitter(BaseSparseSplitter): best.pos) self.criterion.reset() + self.criterion.update(best.pos) best.improvement = self.criterion.impurity_improvement(impurity) self.criterion.children_impurity(&best.impurity_left, @@ -1563,6 +1567,7 @@ cdef class RandomSparseSplitter(BaseSparseSplitter): # Evaluate split self.criterion.reset() + self.criterion.update(current.pos) # Reject if min_weight_leaf is not satisfied @@ -1590,6 +1595,7 @@ cdef class 
RandomSparseSplitter(BaseSparseSplitter): best.pos) self.criterion.reset() + self.criterion.update(best.pos) best.improvement = self.criterion.impurity_improvement(impurity) self.criterion.children_impurity(&best.impurity_left, From 5844d810a29b5e83d50033431d3eddc6e895d0a5 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Sat, 21 May 2016 22:44:38 -0700 Subject: [PATCH 21/75] fix: incorrect type cast in median array generation for node_impurity --- sklearn/tree/_criterion.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 15177bcd7be95..0b89753af45a7 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -974,7 +974,7 @@ cdef class MAE(RegressionCriterion): """Evaluate the impurity of the current node, i.e. the impurity of samples[start:end]""" cdef double* medians = NULL - medians = calloc(self.n_outputs, sizeof(double)) + medians = calloc(self.n_outputs, sizeof(double)) if (medians == NULL): with gil: raise MemoryError() From 134eb9250ee015dbdd2516f33fe2c7a7c2ed69a9 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Sun, 22 May 2016 00:14:38 -0700 Subject: [PATCH 22/75] slightly tweak node_impurity function --- sklearn/tree/_criterion.pyx | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 0b89753af45a7..e6453b9a4e1c4 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -968,7 +968,6 @@ cdef class MAE(RegressionCriterion): cdef SIZE_t end = self.end compute_weighted_median(dest, start, end, self.sample_weight, self.y, self.samples, self.y_stride, self.n_outputs) - cdef SIZE_t i cdef double node_impurity(self) nogil: """Evaluate the impurity of the current node, i.e. 
the impurity of @@ -987,8 +986,10 @@ cdef class MAE(RegressionCriterion): for p in range(self.start, self.end): i = samples[p] y_ik = self.y[i * self.y_stride + k] - impurity += (fabs(y_ik - medians[k]) / self.weighted_n_node_samples) - return impurity / self.n_outputs + # impurity += (fabs(y_ik - medians[k]) / self.weighted_n_node_samples) + impurity += fabs(y_ik - medians[k]) + # return impurity / self.n_outputs + return impurity / (self.weighted_n_node_samples * self.n_outputs) # cdef double proxy_impurity_improvement(self) nogil: # """Compute a proxy of the impurity reduction From 115df19f5cdfde4a14d435d19b1f37dc4f58e72b Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Sun, 22 May 2016 12:26:19 -0700 Subject: [PATCH 23/75] fix: be more explicit with casts --- sklearn/tree/_criterion.pyx | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index e6453b9a4e1c4..adade6eba2fd9 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -987,7 +987,7 @@ cdef class MAE(RegressionCriterion): i = samples[p] y_ik = self.y[i * self.y_stride + k] # impurity += (fabs(y_ik - medians[k]) / self.weighted_n_node_samples) - impurity += fabs(y_ik - medians[k]) + impurity += fabs(( y_ik) - medians[k]) # return impurity / self.n_outputs return impurity / (self.weighted_n_node_samples * self.n_outputs) @@ -1039,9 +1039,9 @@ cdef class MAE(RegressionCriterion): i = samples[p] y_ik = y[i * self.y_stride + k] # impurity_left[0] += (fabs(y_ik - medians[k]) / (pos - start)) - impurity_left[0] += fabs(y_ik - medians[k]) + impurity_left[0] += fabs((y_ik) - medians[k]) # impurity_left[0] /= self.n_outputs - impurity_left[0] /= ((pos - start) * self.n_outputs) + impurity_left[0] /= ((pos - start) * self.n_outputs) compute_weighted_median(medians, pos, end, self.sample_weight, @@ -1052,9 +1052,9 @@ cdef class MAE(RegressionCriterion): i = samples[p] y_ik = y[i * self.y_stride + k] # impurity_right[0] += (fabs(y_ik - medians[k]) / (end - pos)) - impurity_right[0] += fabs(y_ik - medians[k]) + impurity_right[0] += fabs((y_ik) - medians[k]) # impurity_right[0] /= self.n_outputs - impurity_right[0] /= ((end - pos) * self.n_outputs) + impurity_right[0] /= ((end - pos) * self.n_outputs) cdef class FriedmanMSE(MSE): From 6fa918fc0a7fe4f2a3d673ff69e5a2bf4fd069d1 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Fri, 27 May 2016 15:43:37 -0700 Subject: [PATCH 24/75] feature: revert cosmetic changes and free temporary arrays --- sklearn/tree/_criterion.pxd | 2 +- sklearn/tree/_criterion.pyx | 17 +++++++++++------ sklearn/tree/_splitter.pyx | 7 ------- sklearn/tree/_utils.pyx | 4 +++- 4 files changed, 15 insertions(+), 15 deletions(-) diff --git a/sklearn/tree/_criterion.pxd b/sklearn/tree/_criterion.pxd index 172d57659e6a6..889a623d732b3 100644 --- a/sklearn/tree/_criterion.pxd +++ b/sklearn/tree/_criterion.pxd @@ -44,7 +44,7 @@ cdef class Criterion: # weighted count of each label. For regression, # the sum of w*y. sum_total[k] is equal to # sum_{i=start}^{end-1} w[samples[i]]*y[samples[i], k], - # where k is output index. + # where k is output index. 
cdef double* sum_left # Same as above, but for the left side of the split cdef double* sum_right # same as above, but for the right side of the split diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index adade6eba2fd9..98451ff2c00b5 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -166,6 +166,7 @@ cdef class Criterion: cdef double impurity_left cdef double impurity_right self.children_impurity(&impurity_left, &impurity_right) + return (- self.weighted_n_right * impurity_right - self.weighted_n_left * impurity_left) @@ -198,9 +199,9 @@ cdef class Criterion: self.children_impurity(&impurity_left, &impurity_right) return ((self.weighted_n_node_samples / self.weighted_n_samples) * - (impurity - (self.weighted_n_right / + (impurity - (self.weighted_n_right / self.weighted_n_node_samples * impurity_right) - - (self.weighted_n_left / + - (self.weighted_n_left / self.weighted_n_node_samples * impurity_left))) @@ -263,7 +264,7 @@ cdef class ClassificationCriterion(Criterion): self.sum_left = calloc(n_elements, sizeof(double)) self.sum_right = calloc(n_elements, sizeof(double)) - if (self.sum_total == NULL or + if (self.sum_total == NULL or self.sum_left == NULL or self.sum_right == NULL): raise MemoryError() @@ -819,6 +820,7 @@ cdef class RegressionCriterion(Criterion): # and that sum_total is known, we are going to update # sum_left from the direction that require the least amount # of computations, i.e. from pos to new_pos or from end to new_po. + if (new_pos - pos) <= (end - new_pos): for p in range(pos, new_pos): i = samples[p] @@ -846,7 +848,7 @@ cdef class RegressionCriterion(Criterion): self.weighted_n_left -= w - self.weighted_n_right = (self.weighted_n_node_samples - + self.weighted_n_right = (self.weighted_n_node_samples - self.weighted_n_left) for k in range(self.n_outputs): sum_right[k] = sum_total[k] - sum_left[k] @@ -885,6 +887,7 @@ cdef class MSE(RegressionCriterion): impurity = self.sq_sum_total / self.weighted_n_node_samples for k in range(self.n_outputs): impurity -= (sum_total[k] / self.weighted_n_node_samples)**2.0 + return impurity / self.n_outputs cdef double proxy_impurity_improvement(self) nogil: @@ -919,6 +922,7 @@ cdef class MSE(RegressionCriterion): left child (samples[start:pos]) and the impurity the right child (samples[pos:end]).""" + cdef DOUBLE_t* y = self.y cdef DOUBLE_t* sample_weight = self.sample_weight cdef SIZE_t* samples = self.samples @@ -960,8 +964,7 @@ cdef class MSE(RegressionCriterion): impurity_right[0] /= self.n_outputs cdef class MAE(RegressionCriterion): - """Mean absolute error impurity criterion - """ + """Mean absolute error impurity criterion""" cdef void node_value(self, double* dest) nogil: """Computes the node value of samples[start:end] into dest.""" cdef SIZE_t start = self.start @@ -989,6 +992,7 @@ cdef class MAE(RegressionCriterion): # impurity += (fabs(y_ik - medians[k]) / self.weighted_n_node_samples) impurity += fabs(( y_ik) - medians[k]) # return impurity / self.n_outputs + free(medians) return impurity / (self.weighted_n_node_samples * self.n_outputs) # cdef double proxy_impurity_improvement(self) nogil: @@ -1055,6 +1059,7 @@ cdef class MAE(RegressionCriterion): impurity_right[0] += fabs((y_ik) - medians[k]) # impurity_right[0] /= self.n_outputs impurity_right[0] /= ((end - pos) * self.n_outputs) + free(medians) cdef class FriedmanMSE(MSE): diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 31848546d8732..39c17fc1ca55e 100644 --- a/sklearn/tree/_splitter.pyx 
+++ b/sklearn/tree/_splitter.pyx
@@ -485,7 +485,6 @@ cdef class BestSplitter(BaseDenseSplitter):
                     samples[p] = tmp

         self.criterion.reset()
-
         self.criterion.update(best.pos)
         best.improvement = self.criterion.impurity_improvement(impurity)
         self.criterion.children_impurity(&best.impurity_left,
@@ -782,7 +781,6 @@ cdef class RandomSplitter(BaseDenseSplitter):

                 # Evaluate split
                 self.criterion.reset()
-
                 self.criterion.update(current.pos)

                 # Reject if min_weight_leaf is not satisfied
@@ -816,7 +814,6 @@ cdef class RandomSplitter(BaseDenseSplitter):

         self.criterion.reset()
-
         self.criterion.update(best.pos)
         best.improvement = self.criterion.impurity_improvement(impurity)
         self.criterion.children_impurity(&best.impurity_left,
@@ -1356,7 +1353,6 @@ cdef class BestSparseSplitter(BaseSparseSplitter):
                             best = current

         # Reorganize into samples[start:best.pos] + samples[best.pos:end]
-
         if best.pos < end:
             self.extract_nnz(best.feature, &end_negative, &start_positive,
                              &is_samples_sorted)
@@ -1365,7 +1361,6 @@ cdef class BestSparseSplitter(BaseSparseSplitter):
                             best.pos)

         self.criterion.reset()
-
         self.criterion.update(best.pos)
         best.improvement = self.criterion.impurity_improvement(impurity)
         self.criterion.children_impurity(&best.impurity_left,
@@ -1567,7 +1562,6 @@ cdef class RandomSparseSplitter(BaseSparseSplitter):

                     # Evaluate split
                     self.criterion.reset()
-
                     self.criterion.update(current.pos)

                     # Reject if min_weight_leaf is not satisfied
@@ -1595,7 +1589,6 @@ cdef class RandomSparseSplitter(BaseSparseSplitter):
                             best.pos)

         self.criterion.reset()
-
         self.criterion.update(best.pos)
         best.improvement = self.criterion.impurity_improvement(impurity)
         self.criterion.children_impurity(&best.impurity_left,
diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx
index 78998b36f39e3..bf6d8bccf99d0 100644
--- a/sklearn/tree/_utils.pyx
+++ b/sklearn/tree/_utils.pyx
@@ -85,7 +85,7 @@ cdef void compute_weighted_median(double* median_dest, SIZE_t start, SIZE_t end,
         median_index = 0
         sum_weights = 0.0

-        for p in range(0, n_node_samples):
+        for p in range(n_node_samples):
             i = samples[p + start]

             y_ik = y[i * y_stride + k]
@@ -118,6 +118,8 @@ cdef void compute_weighted_median(double* median_dest, SIZE_t start, SIZE_t end,
             median_dest[k] = (y_vals[n_node_samples / 2] + y_vals[(n_node_samples / 2) - 1])/2
         else:
             median_dest[k] = y_vals[n_node_samples / 2]
+    free(y_vals)
+    free(weights)

From 8d005941a3b8b9f0cb88398040bef805735ccfff Mon Sep 17 00:00:00 2001
From: Nelson Liu
Date: Fri, 27 May 2016 15:54:53 -0700
Subject: [PATCH 25/75] fix: only free weight array in median calculation if
 it was created

---
 sklearn/tree/_criterion.pyx | 7 ++++---
 sklearn/tree/_splitter.pyx  | 1 +
 sklearn/tree/_utils.pyx     | 3 ++-
 3 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx
index 98451ff2c00b5..f715d6f5e4e50 100644
--- a/sklearn/tree/_criterion.pyx
+++ b/sklearn/tree/_criterion.pyx
@@ -198,6 +198,7 @@ cdef class Criterion:
         cdef double impurity_right

         self.children_impurity(&impurity_left, &impurity_right)
+
         return ((self.weighted_n_node_samples / self.weighted_n_samples) *
                 (impurity - (self.weighted_n_right /
                              self.weighted_n_node_samples * impurity_right)
@@ -723,7 +724,7 @@ cdef class RegressionCriterion(Criterion):
         self.sum_left = calloc(n_outputs, sizeof(double))
         self.sum_right = calloc(n_outputs, sizeof(double))

-        if (self.sum_total == NULL or
+        if (self.sum_total == NULL or
             self.sum_left == NULL or self.sum_right ==
NULL): raise MemoryError() @@ -958,7 +959,7 @@ cdef class MSE(RegressionCriterion): for k in range(self.n_outputs): impurity_left[0] -= (sum_left[k] / self.weighted_n_left) ** 2.0 - impurity_right[0] -= (sum_right[k] / self.weighted_n_right) ** 2.0 + impurity_right[0] -= (sum_right[k] / self.weighted_n_right) ** 2.0 impurity_left[0] /= self.n_outputs impurity_right[0] /= self.n_outputs @@ -1118,5 +1119,5 @@ cdef class FriedmanMSE(MSE): diff = (self.weighted_n_right * total_sum_left - self.weighted_n_left * total_sum_right) / self.n_outputs - return (diff * diff / (self.weighted_n_left * self.weighted_n_right * + return (diff * diff / (self.weighted_n_left * self.weighted_n_right * self.weighted_n_node_samples)) diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 39c17fc1ca55e..0617508aab236 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -458,6 +458,7 @@ cdef class BestSplitter(BaseDenseSplitter): continue current_proxy_improvement = self.criterion.proxy_impurity_improvement() + if current_proxy_improvement > best_proxy_improvement: best_proxy_improvement = current_proxy_improvement current.threshold = (Xf[p - 1] + Xf[p]) / 2.0 diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index bf6d8bccf99d0..8d6da1f4e82c8 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -119,7 +119,8 @@ cdef void compute_weighted_median(double* median_dest, SIZE_t start, SIZE_t end, else: median_dest[k] = y_vals[n_node_samples / 2] free(y_vals) - free(weights) + if sample_weight != NULL: + free(weights) cdef void sort_values_and_weights(DOUBLE_t* y_vals, DOUBLE_t* weights, From d51e09184b661662a085aa8ea1c049826638b990 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Fri, 27 May 2016 18:49:42 -0700 Subject: [PATCH 26/75] style: remove extraneous newline / trigger CI build --- sklearn/tree/_utils.pyx | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index 8d6da1f4e82c8..c2823ddbeeac6 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -16,7 +16,6 @@ from libc.stdlib cimport calloc from libc.stdlib cimport realloc from libc.math cimport log as ln - import numpy as np cimport numpy as np np.import_array() From a9ccf188a52e861b1c530c5c78970b172cce5a20 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Fri, 27 May 2016 18:54:37 -0700 Subject: [PATCH 27/75] style: remove extraneous 0 from range --- sklearn/tree/_utils.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index c2823ddbeeac6..843b3e460859a 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -85,7 +85,7 @@ cdef void compute_weighted_median(double* median_dest, SIZE_t start, SIZE_t end, sum_weights = 0.0 for p in range(n_node_samples): - i = samples[p + start] + i = samples[start + p] y_ik = y[i * y_stride + k] if sample_weight != NULL: @@ -98,7 +98,7 @@ cdef void compute_weighted_median(double* median_dest, SIZE_t start, SIZE_t end, n_node_samples - 1) if sample_weight != NULL: # calculate the weighted median - for p in range(0, n_node_samples): + for p in range(n_node_samples): sum_weights += weights[p] running_sum = sum_weights - weights[0] From f46b3c2fcd067fac73a105b1724cbc56fb12e07a Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Sun, 29 May 2016 03:46:44 -0700 Subject: [PATCH 28/75] feature: save sorts within a node to speed it up --- sklearn/tree/_criterion.pxd | 2 ++ sklearn/tree/_criterion.pyx | 44 
+++++++++++++++++++--------- sklearn/tree/_utils.pxd | 6 ++-- sklearn/tree/_utils.pyx | 58 ++++++++++--------------------------- 4 files changed, 51 insertions(+), 59 deletions(-) diff --git a/sklearn/tree/_criterion.pxd b/sklearn/tree/_criterion.pxd index 889a623d732b3..a6a4f10885b43 100644 --- a/sklearn/tree/_criterion.pxd +++ b/sklearn/tree/_criterion.pxd @@ -47,6 +47,8 @@ cdef class Criterion: # where k is output index. cdef double* sum_left # Same as above, but for the left side of the split cdef double* sum_right # same as above, but for the right side of the split + cdef DOUBLE_t* coupled_sorted_y + cdef DOUBLE_t* coupled_sorted_weights # The criterion object is maintained such that left and right collected # statistics correspond to samples[start:pos] and samples[pos:end]. diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index f715d6f5e4e50..479c2bc8b9794 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -29,6 +29,7 @@ from ._utils cimport log from ._utils cimport safe_realloc from ._utils cimport sizet_ptr_to_ndarray from ._utils cimport compute_weighted_median +from ._utils cimport sort_values_and_weights cdef class Criterion: """Interface for impurity criteria. @@ -43,6 +44,8 @@ cdef class Criterion: free(self.sum_total) free(self.sum_left) free(self.sum_right) + free(self.coupled_sorted_y) + free(self.coupled_sorted_weights) def __getstate__(self): return {} @@ -718,6 +721,8 @@ cdef class RegressionCriterion(Criterion): self.sum_total = NULL self.sum_left = NULL self.sum_right = NULL + self.coupled_sorted_y = NULL + self.coupled_sorted_weights = NULL # Allocate memory for the accumulators self.sum_total = calloc(n_outputs, sizeof(double)) @@ -755,6 +760,16 @@ cdef class RegressionCriterion(Criterion): cdef DOUBLE_t w_y_ik cdef DOUBLE_t w = 1.0 + self.coupled_sorted_y = calloc(self.n_node_samples, + sizeof(DOUBLE_t)) + if sample_weight != NULL: + self.coupled_sorted_weights = calloc(self.n_node_samples, + sizeof(DOUBLE_t)) + if(self.coupled_sorted_y == NULL or + (self.coupled_sorted_weights == NULL and sample_weight != NULL)): + with gil: + raise MemoryError() + self.sq_sum_total = 0.0 memset(self.sum_total, 0, self.n_outputs * sizeof(double)) @@ -763,15 +778,20 @@ cdef class RegressionCriterion(Criterion): if sample_weight != NULL: w = sample_weight[i] + self.coupled_sorted_weights[p - start] = w for k in range(self.n_outputs): y_ik = y[i * y_stride + k] + self.coupled_sorted_y[p - start] = y_ik w_y_ik = w * y_ik self.sum_total[k] += w_y_ik self.sq_sum_total += w_y_ik * y_ik self.weighted_n_node_samples += w + sort_values_and_weights(self.coupled_sorted_y, self.coupled_sorted_weights, + 0, self.n_node_samples-1) + # Reset to pos=start self.reset() @@ -970,8 +990,10 @@ cdef class MAE(RegressionCriterion): """Computes the node value of samples[start:end] into dest.""" cdef SIZE_t start = self.start cdef SIZE_t end = self.end - compute_weighted_median(dest, start, end, self.sample_weight, self.y, - self.samples, self.y_stride, self.n_outputs) + compute_weighted_median(dest, 0, end-start, + self.coupled_sorted_weights, + self.coupled_sorted_y, + self.n_outputs) cdef double node_impurity(self) nogil: """Evaluate the impurity of the current node, i.e. 
the impurity of @@ -990,9 +1012,7 @@ cdef class MAE(RegressionCriterion): for p in range(self.start, self.end): i = samples[p] y_ik = self.y[i * self.y_stride + k] - # impurity += (fabs(y_ik - medians[k]) / self.weighted_n_node_samples) impurity += fabs(( y_ik) - medians[k]) - # return impurity / self.n_outputs free(medians) return impurity / (self.weighted_n_node_samples * self.n_outputs) @@ -1033,8 +1053,9 @@ cdef class MAE(RegressionCriterion): with gil: raise MemoryError() - compute_weighted_median(medians, start, pos, self.sample_weight, - self.y, self.samples, self.y_stride, + compute_weighted_median(medians, 0, pos-start, + self.coupled_sorted_weights, + self.coupled_sorted_y, self.n_outputs) impurity_left[0] = 0.0 impurity_right[0] = 0.0 @@ -1043,22 +1064,17 @@ cdef class MAE(RegressionCriterion): for p in range(start, pos): i = samples[p] y_ik = y[i * self.y_stride + k] - # impurity_left[0] += (fabs(y_ik - medians[k]) / (pos - start)) impurity_left[0] += fabs((y_ik) - medians[k]) - # impurity_left[0] /= self.n_outputs impurity_left[0] /= ((pos - start) * self.n_outputs) - - - compute_weighted_median(medians, pos, end, self.sample_weight, - self.y, self.samples, self.y_stride, + compute_weighted_median(medians, pos-start, end-start, + self.coupled_sorted_weights, + self.coupled_sorted_y, self.n_outputs) for k in range(self.n_outputs): for p in range(pos, end): i = samples[p] y_ik = y[i * self.y_stride + k] - # impurity_right[0] += (fabs(y_ik - medians[k]) / (end - pos)) impurity_right[0] += fabs((y_ik) - medians[k]) - # impurity_right[0] /= self.n_outputs impurity_right[0] /= ((end - pos) * self.n_outputs) free(medians) diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd index 0678675ab1175..dbea4a9c60233 100644 --- a/sklearn/tree/_utils.pxd +++ b/sklearn/tree/_utils.pxd @@ -40,10 +40,12 @@ cdef realloc_ptr safe_realloc(realloc_ptr* p, size_t nelems) except * cdef np.ndarray sizet_ptr_to_ndarray(SIZE_t* data, SIZE_t size) cdef void compute_weighted_median(double* median_dest, SIZE_t start, SIZE_t end, - DOUBLE_t* sample_weight, DOUBLE_t* y, - SIZE_t* samples, SIZE_t y_stride, + DOUBLE_t* coupled_sorted_weights, + DOUBLE_t* coupled_sorted_y, SIZE_t n_outputs) nogil +cdef void sort_values_and_weights(DOUBLE_t* y_vals, DOUBLE_t* weights, + SIZE_t low, SIZE_t high) nogil cdef SIZE_t rand_int(SIZE_t low, SIZE_t high, UINT32_t* random_state) nogil diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index 843b3e460859a..74e197e6f9293 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -51,75 +51,47 @@ def _realloc_test(): cdef void compute_weighted_median(double* median_dest, SIZE_t start, SIZE_t end, - DOUBLE_t* sample_weight, DOUBLE_t* y, - SIZE_t* samples, SIZE_t y_stride, + DOUBLE_t* coupled_sorted_weights, + DOUBLE_t* coupled_sorted_y, SIZE_t n_outputs) nogil: """Calculate the weighted median of samples[start:end] and put it into a destination pointer given values, weights, and a start and end index. 
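    Assumes the values and their weights have already been sorted together
    in ascending order of the values (see sort_values_and_weights).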
""" cdef DOUBLE_t w = 1.0 - cdef DOUBLE_t y_ik - cdef SIZE_t n_node_samples = end - start - - cdef SIZE_t i, p, k + cdef SIZE_t p, k cdef DOUBLE_t sum_weights cdef SIZE_t median_index cdef DOUBLE_t running_sum - - cdef DOUBLE_t* y_vals = NULL - cdef DOUBLE_t* weights = NULL - y_vals = calloc(n_node_samples, - sizeof(DOUBLE_t)) - - if sample_weight != NULL: - weights = calloc(n_node_samples, - sizeof(DOUBLE_t)) - - if (y_vals == NULL or (weights == NULL and sample_weight != NULL)): - with gil: - raise MemoryError() + cdef SIZE_t n_node_samples = end - start for k in range(n_outputs): median_index = 0 sum_weights = 0.0 - for p in range(n_node_samples): - i = samples[start + p] - - y_ik = y[i * y_stride + k] - if sample_weight != NULL: - w = sample_weight[i] - weights[p] = w - - y_vals[p] = y_ik - - sort_values_and_weights(y_vals, weights, 0, - n_node_samples - 1) - if sample_weight != NULL: + if coupled_sorted_weights != NULL: # calculate the weighted median for p in range(n_node_samples): - sum_weights += weights[p] + sum_weights += coupled_sorted_weights[p] - running_sum = sum_weights - weights[0] + running_sum = sum_weights - coupled_sorted_weights[0] while(running_sum > sum_weights/2): median_index += 1 - running_sum -= weights[median_index] + running_sum -= coupled_sorted_weights[median_index] if running_sum == sum_weights/2: - median_dest[k] = (y_vals[median_index] + y_vals[median_index + 1]) / 2 + median_dest[k] = (coupled_sorted_y[median_index] + + coupled_sorted_y[median_index + 1]) / 2 else: - median_dest[k] = y_vals[median_index] + median_dest[k] = coupled_sorted_y[median_index] else: # calculate the unweighted median - if n_node_samples % 2 == 0: - median_dest[k] = (y_vals[n_node_samples / 2] + y_vals[(n_node_samples / 2) - 1])/2 + if (n_node_samples) % 2 == 0: + median_dest[k] = (coupled_sorted_y[(n_node_samples / 2) + start] + + coupled_sorted_y[(n_node_samples / 2) - 1 + start])/2 else: - median_dest[k] = y_vals[n_node_samples / 2] - free(y_vals) - if sample_weight != NULL: - free(weights) + median_dest[k] = coupled_sorted_y[(n_node_samples / 2) + start] cdef void sort_values_and_weights(DOUBLE_t* y_vals, DOUBLE_t* weights, From 5635c979dbdc39d619c167ac45d0e2d6e1154f0e Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Sun, 29 May 2016 12:17:19 -0700 Subject: [PATCH 29/75] fix: move parts of dealloc to regression criterion --- sklearn/tree/_criterion.pxd | 6 ++++-- sklearn/tree/_criterion.pyx | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/sklearn/tree/_criterion.pxd b/sklearn/tree/_criterion.pxd index a6a4f10885b43..37e7f055b457a 100644 --- a/sklearn/tree/_criterion.pxd +++ b/sklearn/tree/_criterion.pxd @@ -47,8 +47,10 @@ cdef class Criterion: # where k is output index. cdef double* sum_left # Same as above, but for the left side of the split cdef double* sum_right # same as above, but for the right side of the split - cdef DOUBLE_t* coupled_sorted_y - cdef DOUBLE_t* coupled_sorted_weights + cdef DOUBLE_t* coupled_sorted_y # For MAE regression criteria, this stores the + # sorted y values + cdef DOUBLE_t* coupled_sorted_weights # For MAE regression criteria, this stores the + # weights corresponding to the sorted y values # The criterion object is maintained such that left and right collected # statistics correspond to samples[start:pos] and samples[pos:end]. 
diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 479c2bc8b9794..3130a7353772c 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -44,8 +44,6 @@ cdef class Criterion: free(self.sum_total) free(self.sum_left) free(self.sum_right) - free(self.coupled_sorted_y) - free(self.coupled_sorted_weights) def __getstate__(self): return {} @@ -686,6 +684,10 @@ cdef class RegressionCriterion(Criterion): var = \sum_i^n (y_i - y_bar) ** 2 = (\sum_i^n y_i ** 2) - n_samples * y_bar ** 2 """ + def __dealloc__(self): + """Destructor.""" + free(self.coupled_sorted_y) + free(self.coupled_sorted_weights) cdef double sq_sum_total From 97d44e335d0d459d98def2273328afd26fd0d751 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Sun, 29 May 2016 12:37:46 -0700 Subject: [PATCH 30/75] chore: add comment to splitter to try to force recythonizing --- sklearn/tree/_splitter.pyx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 0617508aab236..ff808c9c2639e 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -15,6 +15,8 @@ # # License: BSD 3 clause +# adding a new line to attempt to force recythonizing. + from ._criterion cimport Criterion from libc.stdlib cimport free From 58949f78d194dc07c9421e14ccbd1f4947ff5d4a Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Sun, 29 May 2016 12:46:36 -0700 Subject: [PATCH 31/75] chore: add comment to _tree.pyx to try to force recythonizing --- sklearn/tree/_tree.pyx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index f44320a7b47ae..69697f36c10d4 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -15,6 +15,8 @@ # # License: BSD 3 clause +# adding a comment to try to force recythonization + from cpython cimport Py_INCREF, PyObject from libc.stdlib cimport free From 0be994f89911148f4ade7bd4b98cb81b0080fa37 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Sun, 29 May 2016 13:00:26 -0700 Subject: [PATCH 32/75] chore: add empty comment to gradient boosting to force recythonizing --- sklearn/ensemble/_gradient_boosting.pyx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sklearn/ensemble/_gradient_boosting.pyx b/sklearn/ensemble/_gradient_boosting.pyx index 9e6e9f6d29c0e..fdd670cf31bc4 100644 --- a/sklearn/ensemble/_gradient_boosting.pyx +++ b/sklearn/ensemble/_gradient_boosting.pyx @@ -6,6 +6,8 @@ # # License: BSD 3 clause +# another empty comment to force recythonizing + cimport cython import numpy as np From 492ea7d590b2d3a3eb56229e38322d0c90823a38 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Sun, 29 May 2016 14:11:25 -0700 Subject: [PATCH 33/75] fix: fix bug in weighted median --- sklearn/tree/_criterion.pyx | 2 -- sklearn/tree/_utils.pyx | 9 ++++----- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 3130a7353772c..b7fe0f949c487 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -1027,7 +1027,6 @@ cdef class MAE(RegressionCriterion): # The absolute impurity improvement is only computed by the # impurity_improvement method once the best split has been found. 
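    # (While this override remains commented out, MAE inherits the
    # default Criterion.proxy_impurity_improvement, which computes the
    # proxy from children_impurity directly.)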
# """ - # # todo # pass @@ -1049,7 +1048,6 @@ cdef class MAE(RegressionCriterion): cdef DOUBLE_t y_ik cdef double* medians = NULL - medians = calloc(self.n_outputs, sizeof(double)) if (medians == NULL): with gil: diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index 74e197e6f9293..616ad1c1652b7 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -12,7 +12,6 @@ from libc.stdlib cimport free from libc.stdlib cimport malloc -from libc.stdlib cimport calloc from libc.stdlib cimport realloc from libc.math cimport log as ln @@ -81,15 +80,15 @@ cdef void compute_weighted_median(double* median_dest, SIZE_t start, SIZE_t end, running_sum -= coupled_sorted_weights[median_index] if running_sum == sum_weights/2: - median_dest[k] = (coupled_sorted_y[median_index] + - coupled_sorted_y[median_index + 1]) / 2 + median_dest[k] = (coupled_sorted_y[median_index + start] + + coupled_sorted_y[median_index + 1 + start]) / 2.0 else: - median_dest[k] = coupled_sorted_y[median_index] + median_dest[k] = coupled_sorted_y[median_index + start] else: # calculate the unweighted median if (n_node_samples) % 2 == 0: median_dest[k] = (coupled_sorted_y[(n_node_samples / 2) + start] + - coupled_sorted_y[(n_node_samples / 2) - 1 + start])/2 + coupled_sorted_y[(n_node_samples / 2) - 1 + start]) / 2.0 else: median_dest[k] = coupled_sorted_y[(n_node_samples / 2) + start] From 00fbe6efabae5c101dcbfcbc6fb87e187d77cecb Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Sun, 29 May 2016 14:48:27 -0700 Subject: [PATCH 34/75] try moving sorted values to a class variable --- sklearn/ensemble/_gradient_boosting.pyx | 2 -- sklearn/tree/_criterion.pxd | 4 ++-- sklearn/tree/_criterion.pyx | 2 ++ sklearn/tree/_splitter.pyx | 2 -- sklearn/tree/_tree.pyx | 2 -- 5 files changed, 4 insertions(+), 8 deletions(-) diff --git a/sklearn/ensemble/_gradient_boosting.pyx b/sklearn/ensemble/_gradient_boosting.pyx index fdd670cf31bc4..9e6e9f6d29c0e 100644 --- a/sklearn/ensemble/_gradient_boosting.pyx +++ b/sklearn/ensemble/_gradient_boosting.pyx @@ -6,8 +6,6 @@ # # License: BSD 3 clause -# another empty comment to force recythonizing - cimport cython import numpy as np diff --git a/sklearn/tree/_criterion.pxd b/sklearn/tree/_criterion.pxd index 37e7f055b457a..7320fb184d5e8 100644 --- a/sklearn/tree/_criterion.pxd +++ b/sklearn/tree/_criterion.pxd @@ -47,9 +47,9 @@ cdef class Criterion: # where k is output index. cdef double* sum_left # Same as above, but for the left side of the split cdef double* sum_right # same as above, but for the right side of the split - cdef DOUBLE_t* coupled_sorted_y # For MAE regression criteria, this stores the + # cdef DOUBLE_t* coupled_sorted_y # For MAE regression criteria, this stores the # sorted y values - cdef DOUBLE_t* coupled_sorted_weights # For MAE regression criteria, this stores the + # cdef DOUBLE_t* coupled_sorted_weights # For MAE regression criteria, this stores the # weights corresponding to the sorted y values # The criterion object is maintained such that left and right collected diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index b7fe0f949c487..44eeb3350c1b2 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -690,6 +690,8 @@ cdef class RegressionCriterion(Criterion): free(self.coupled_sorted_weights) cdef double sq_sum_total + cdef DOUBLE_t* coupled_sorted_y + cdef DOUBLE_t* coupled_sorted_weights def __cinit__(self, SIZE_t n_outputs): """Initialize parameters for this criterion. 
diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index ff808c9c2639e..0617508aab236 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -15,8 +15,6 @@ # # License: BSD 3 clause -# adding a new line to attempt to force recythonizing. - from ._criterion cimport Criterion from libc.stdlib cimport free diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 69697f36c10d4..f44320a7b47ae 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -15,8 +15,6 @@ # # License: BSD 3 clause -# adding a comment to try to force recythonization - from cpython cimport Py_INCREF, PyObject from libc.stdlib cimport free From f03cf38f92030a1908c0c99d602b703b30bd3478 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Sat, 18 Jun 2016 13:56:45 -0700 Subject: [PATCH 35/75] feature: refactor criterion to sort once initially, then draw all samples from this sorted data --- sklearn/tree/_criterion.pyx | 74 ++++++++++++++++++------------------- 1 file changed, 35 insertions(+), 39 deletions(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 44eeb3350c1b2..c95acf73cf60a 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -692,6 +692,7 @@ cdef class RegressionCriterion(Criterion): cdef double sq_sum_total cdef DOUBLE_t* coupled_sorted_y cdef DOUBLE_t* coupled_sorted_weights + cdef bint initialized def __cinit__(self, SIZE_t n_outputs): """Initialize parameters for this criterion. @@ -714,6 +715,7 @@ cdef class RegressionCriterion(Criterion): self.n_outputs = n_outputs self.n_node_samples = 0 + self.initialized = 0 self.weighted_n_node_samples = 0.0 self.weighted_n_left = 0.0 self.weighted_n_right = 0.0 @@ -764,37 +766,49 @@ cdef class RegressionCriterion(Criterion): cdef DOUBLE_t w_y_ik cdef DOUBLE_t w = 1.0 - self.coupled_sorted_y = calloc(self.n_node_samples, - sizeof(DOUBLE_t)) - if sample_weight != NULL: - self.coupled_sorted_weights = calloc(self.n_node_samples, - sizeof(DOUBLE_t)) - if(self.coupled_sorted_y == NULL or - (self.coupled_sorted_weights == NULL and sample_weight != NULL)): - with gil: - raise MemoryError() - self.sq_sum_total = 0.0 memset(self.sum_total, 0, self.n_outputs * sizeof(double)) + if self.initialized == 0: + self.coupled_sorted_y = calloc(self.n_node_samples, + sizeof(DOUBLE_t)) + if sample_weight != NULL: + self.coupled_sorted_weights = calloc(self.n_node_samples, + sizeof(DOUBLE_t)) + if(self.coupled_sorted_y == NULL or + (self.coupled_sorted_weights == NULL and sample_weight != NULL)): + with gil: + raise MemoryError() + for p in range(start, end): i = samples[p] if sample_weight != NULL: w = sample_weight[i] - self.coupled_sorted_weights[p - start] = w for k in range(self.n_outputs): y_ik = y[i * y_stride + k] - self.coupled_sorted_y[p - start] = y_ik w_y_ik = w * y_ik self.sum_total[k] += w_y_ik self.sq_sum_total += w_y_ik * y_ik self.weighted_n_node_samples += w - sort_values_and_weights(self.coupled_sorted_y, self.coupled_sorted_weights, - 0, self.n_node_samples-1) + if self.initialized == 0: + for p in range(start, end): + i = samples[p] + + if sample_weight != NULL: + w = sample_weight[i] + self.coupled_sorted_weights[p - start] = w + + for k in range(self.n_outputs): + y_ik = y[i * y_stride + k] + self.coupled_sorted_y[p - start] = y_ik + + sort_values_and_weights(self.coupled_sorted_y, self.coupled_sorted_weights, + 0, self.n_node_samples-1) + self.initialized = 1 # Reset to pos=start self.reset() @@ -821,7 +835,6 @@ cdef class RegressionCriterion(Criterion): cdef 
void update(self, SIZE_t new_pos) nogil: """Updated statistics by moving samples[pos:new_pos] to the left.""" - cdef double* sum_left = self.sum_left cdef double* sum_right = self.sum_right cdef double* sum_total = self.sum_total @@ -912,7 +925,6 @@ cdef class MSE(RegressionCriterion): impurity = self.sq_sum_total / self.weighted_n_node_samples for k in range(self.n_outputs): impurity -= (sum_total[k] / self.weighted_n_node_samples)**2.0 - return impurity / self.n_outputs cdef double proxy_impurity_improvement(self) nogil: @@ -937,7 +949,6 @@ cdef class MSE(RegressionCriterion): for k in range(self.n_outputs): proxy_impurity_left += sum_left[k] * sum_left[k] proxy_impurity_right += sum_right[k] * sum_right[k] - return (proxy_impurity_left / self.weighted_n_left + proxy_impurity_right / self.weighted_n_right) @@ -994,7 +1005,7 @@ cdef class MAE(RegressionCriterion): """Computes the node value of samples[start:end] into dest.""" cdef SIZE_t start = self.start cdef SIZE_t end = self.end - compute_weighted_median(dest, 0, end-start, + compute_weighted_median(dest, self.start, self.end, self.coupled_sorted_weights, self.coupled_sorted_y, self.n_outputs) @@ -1014,31 +1025,18 @@ cdef class MAE(RegressionCriterion): self.node_value(medians) for k in range(self.n_outputs): for p in range(self.start, self.end): - i = samples[p] - y_ik = self.y[i * self.y_stride + k] + y_ik = self.coupled_sorted_y[p] impurity += fabs(( y_ik) - medians[k]) free(medians) return impurity / (self.weighted_n_node_samples * self.n_outputs) - # cdef double proxy_impurity_improvement(self) nogil: - # """Compute a proxy of the impurity reduction - # This method is used to speed up the search for the best split. - # It is a proxy quantity such that the split that maximizes this value - # also maximizes the impurity improvement. It neglects all constant terms - # of the impurity decrease for a given split. - # The absolute impurity improvement is only computed by the - # impurity_improvement method once the best split has been found. - # """ - # # todo - # pass - cdef void children_impurity(self, double* impurity_left, double* impurity_right) nogil: """Evaluate the impurity in children nodes, i.e. the impurity of the left child (samples[start:pos]) and the impurity the right child (samples[pos:end]). 
""" - cdef DOUBLE_t* y = self.y + cdef DOUBLE_t* y = self.coupled_sorted_y cdef DOUBLE_t* sample_weight = self.sample_weight cdef SIZE_t* samples = self.samples @@ -1055,7 +1053,7 @@ cdef class MAE(RegressionCriterion): with gil: raise MemoryError() - compute_weighted_median(medians, 0, pos-start, + compute_weighted_median(medians, start, pos, self.coupled_sorted_weights, self.coupled_sorted_y, self.n_outputs) @@ -1064,18 +1062,16 @@ cdef class MAE(RegressionCriterion): for k in range(self.n_outputs): for p in range(start, pos): - i = samples[p] - y_ik = y[i * self.y_stride + k] + y_ik = y[p] impurity_left[0] += fabs((y_ik) - medians[k]) impurity_left[0] /= ((pos - start) * self.n_outputs) - compute_weighted_median(medians, pos-start, end-start, + compute_weighted_median(medians, pos, end, self.coupled_sorted_weights, self.coupled_sorted_y, self.n_outputs) for k in range(self.n_outputs): for p in range(pos, end): - i = samples[p] - y_ik = y[i * self.y_stride + k] + y_ik = y[p] impurity_right[0] += fabs((y_ik) - medians[k]) impurity_right[0] /= ((end - pos) * self.n_outputs) free(medians) From 2fdb56d5b799323393e5b023a0f534b2b1df0c53 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Sat, 18 Jun 2016 13:58:14 -0700 Subject: [PATCH 36/75] style: remove extraneous parens from if condition --- sklearn/tree/_criterion.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index c95acf73cf60a..1e0655f103172 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -1015,7 +1015,7 @@ cdef class MAE(RegressionCriterion): samples[start:end]""" cdef double* medians = NULL medians = calloc(self.n_outputs, sizeof(double)) - if (medians == NULL): + if medians == NULL: with gil: raise MemoryError() cdef double impurity = 0.0 From b9aef433c7682b5c65369cf2dd3fcdc3d9ce67c4 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Wed, 22 Jun 2016 07:48:20 -0700 Subject: [PATCH 37/75] implement median-heap method for calculating impurity --- sklearn/tree/_criterion.pxd | 4 - sklearn/tree/_criterion.pyx | 346 +++++++++++++++++++------ sklearn/tree/_utils.pxd | 52 +++- sklearn/tree/_utils.pyx | 487 +++++++++++++++++++++++++++--------- 4 files changed, 682 insertions(+), 207 deletions(-) diff --git a/sklearn/tree/_criterion.pxd b/sklearn/tree/_criterion.pxd index 7320fb184d5e8..889a623d732b3 100644 --- a/sklearn/tree/_criterion.pxd +++ b/sklearn/tree/_criterion.pxd @@ -47,10 +47,6 @@ cdef class Criterion: # where k is output index. cdef double* sum_left # Same as above, but for the left side of the split cdef double* sum_right # same as above, but for the right side of the split - # cdef DOUBLE_t* coupled_sorted_y # For MAE regression criteria, this stores the - # sorted y values - # cdef DOUBLE_t* coupled_sorted_weights # For MAE regression criteria, this stores the - # weights corresponding to the sorted y values # The criterion object is maintained such that left and right collected # statistics correspond to samples[start:pos] and samples[pos:end]. 
diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 1e0655f103172..929f6b7af4b0c 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -12,6 +12,7 @@ # Joel Nothman # Fares Hedayati # Jacob Schreiber +# Nelson Liu # # License: BSD 3 clause @@ -28,8 +29,8 @@ np.import_array() from ._utils cimport log from ._utils cimport safe_realloc from ._utils cimport sizet_ptr_to_ndarray -from ._utils cimport compute_weighted_median -from ._utils cimport sort_values_and_weights +from ._utils cimport MedianHeap +from ._utils cimport MinMaxHeapRecord cdef class Criterion: """Interface for impurity criteria. @@ -684,15 +685,7 @@ cdef class RegressionCriterion(Criterion): var = \sum_i^n (y_i - y_bar) ** 2 = (\sum_i^n y_i ** 2) - n_samples * y_bar ** 2 """ - def __dealloc__(self): - """Destructor.""" - free(self.coupled_sorted_y) - free(self.coupled_sorted_weights) - cdef double sq_sum_total - cdef DOUBLE_t* coupled_sorted_y - cdef DOUBLE_t* coupled_sorted_weights - cdef bint initialized def __cinit__(self, SIZE_t n_outputs): """Initialize parameters for this criterion. @@ -703,7 +696,6 @@ cdef class RegressionCriterion(Criterion): The number of targets to be predicted """ - # Default values self.y = NULL self.y_stride = 0 self.sample_weight = NULL @@ -715,7 +707,6 @@ cdef class RegressionCriterion(Criterion): self.n_outputs = n_outputs self.n_node_samples = 0 - self.initialized = 0 self.weighted_n_node_samples = 0.0 self.weighted_n_left = 0.0 self.weighted_n_right = 0.0 @@ -727,8 +718,6 @@ cdef class RegressionCriterion(Criterion): self.sum_total = NULL self.sum_left = NULL self.sum_right = NULL - self.coupled_sorted_y = NULL - self.coupled_sorted_weights = NULL # Allocate memory for the accumulators self.sum_total = calloc(n_outputs, sizeof(double)) @@ -769,17 +758,6 @@ cdef class RegressionCriterion(Criterion): self.sq_sum_total = 0.0 memset(self.sum_total, 0, self.n_outputs * sizeof(double)) - if self.initialized == 0: - self.coupled_sorted_y = calloc(self.n_node_samples, - sizeof(DOUBLE_t)) - if sample_weight != NULL: - self.coupled_sorted_weights = calloc(self.n_node_samples, - sizeof(DOUBLE_t)) - if(self.coupled_sorted_y == NULL or - (self.coupled_sorted_weights == NULL and sample_weight != NULL)): - with gil: - raise MemoryError() - for p in range(start, end): i = samples[p] @@ -793,29 +771,12 @@ cdef class RegressionCriterion(Criterion): self.sq_sum_total += w_y_ik * y_ik self.weighted_n_node_samples += w - - if self.initialized == 0: - for p in range(start, end): - i = samples[p] - - if sample_weight != NULL: - w = sample_weight[i] - self.coupled_sorted_weights[p - start] = w - - for k in range(self.n_outputs): - y_ik = y[i * y_stride + k] - self.coupled_sorted_y[p - start] = y_ik - - sort_values_and_weights(self.coupled_sorted_y, self.coupled_sorted_weights, - 0, self.n_node_samples-1) - self.initialized = 1 - - # Reset to pos=start self.reset() cdef void reset(self) nogil: """Reset the criterion at pos=start.""" cdef SIZE_t n_bytes = self.n_outputs * sizeof(double) + memset(self.sum_left, 0, n_bytes) memcpy(self.sum_right, self.sum_total, n_bytes) @@ -823,6 +784,7 @@ cdef class RegressionCriterion(Criterion): self.weighted_n_right = self.weighted_n_node_samples self.pos = self.start + cdef void reverse_reset(self) nogil: """Reset the criterion at pos=end.""" cdef SIZE_t n_bytes = self.n_outputs * sizeof(double) @@ -850,6 +812,7 @@ cdef class RegressionCriterion(Criterion): cdef SIZE_t k cdef DOUBLE_t w = 1.0 cdef DOUBLE_t y_ik + cdef DOUBLE_t 
w_y_ik # Update statistics up to new_pos # @@ -868,7 +831,8 @@ cdef class RegressionCriterion(Criterion): for k in range(self.n_outputs): y_ik = y[i * self.y_stride + k] - sum_left[k] += w * y_ik + w_y_ik = w * y_ik + sum_left[k] += w_y_ik self.weighted_n_left += w else: @@ -882,7 +846,8 @@ cdef class RegressionCriterion(Criterion): for k in range(self.n_outputs): y_ik = y[i * self.y_stride + k] - sum_left[k] -= w * y_ik + w_y_ik = w * y_ik + sum_left[k] -= w_y_ik self.weighted_n_left -= w @@ -1001,33 +966,248 @@ cdef class MSE(RegressionCriterion): cdef class MAE(RegressionCriterion): """Mean absolute error impurity criterion""" + def __dealloc__(self): + """Destructor.""" + + free(self.node_medians) + + cdef np.ndarray left_child_heaps + cdef np.ndarray right_child_heaps + cdef double* node_medians + + def __cinit__(self, SIZE_t n_outputs): + """Initialize parameters for this criterion. + + Parameters + ---------- + n_outputs: SIZE_t + The number of targets to be predicted + """ + + # Default values + self.y = NULL + self.y_stride = 0 + self.sample_weight = NULL + + self.samples = NULL + self.start = 0 + self.pos = 0 + self.end = 0 + + self.n_outputs = n_outputs + self.n_node_samples = 0 + self.weighted_n_node_samples = 0.0 + self.weighted_n_left = 0.0 + self.weighted_n_right = 0.0 + + # Allocate accumulators. Make sure they are NULL, not uninitialized, + # before an exception can be raised (which triggers __dealloc__). + self.node_medians = NULL + + # Allocate memory for the accumulators + self.node_medians = calloc(n_outputs, sizeof(double)) + + if (self.node_medians == NULL): + raise MemoryError() + + self.left_child_heaps = np.empty(n_outputs, dtype='object') + self.right_child_heaps = np.empty(n_outputs, dtype='object') + + cdef void init(self, DOUBLE_t* y, SIZE_t y_stride, DOUBLE_t* sample_weight, + double weighted_n_samples, SIZE_t* samples, SIZE_t start, + SIZE_t end) nogil: + """Initialize the criterion at node samples[start:end] and + children samples[start:start] and samples[start:end].""" + + # Initialize fields + self.y = y + self.y_stride = y_stride + self.sample_weight = sample_weight + self.samples = samples + self.start = start + self.end = end + self.n_node_samples = end - start + self.weighted_n_samples = weighted_n_samples + self.weighted_n_node_samples = 0. 
+
+        cdef SIZE_t i
+        cdef SIZE_t p
+        cdef SIZE_t k
+        cdef DOUBLE_t y_ik
+        cdef DOUBLE_t w_y_ik
+        cdef DOUBLE_t w = 1.0
+
+        # Fill accumulators with MedianHeaps
+        with gil:
+            for k in range(self.n_outputs):
+                self.left_child_heaps[k] = MedianHeap(self.n_node_samples)
+                self.right_child_heaps[k] = MedianHeap(self.n_node_samples)
+
+        cdef void** left_child_heaps = <void**> self.left_child_heaps.data
+        cdef void** right_child_heaps = <void**> self.right_child_heaps.data
+
+        for p in range(start, end):
+            i = samples[p]
+
+            if sample_weight != NULL:
+                w = sample_weight[i]
+
+            for k in range(self.n_outputs):
+                y_ik = y[i * y_stride + k]
+                w_y_ik = w * y_ik
+
+                # push all values to the right side,
+                # since pos = start initially anyway
+                (<MedianHeap> right_child_heaps[k]).push(w_y_ik)
+
+            self.weighted_n_node_samples += w
+        # calculate the node medians
+        for k in range(self.n_outputs):
+            (<MedianHeap> right_child_heaps[k]).get_median(&(self.node_medians[k]))
+
+        # Reset to pos=start
+        self.reset()
+
+    cdef void reset(self) nogil:
+        """Reset the criterion at pos=start."""
+
+        cdef SIZE_t i
+        cdef SIZE_t k
+        cdef DOUBLE_t popped
+
+        cdef void** left_child_heaps = <void**> self.left_child_heaps.data
+        cdef void** right_child_heaps = <void**> self.right_child_heaps.data
+
+        self.weighted_n_left = 0.0
+        self.weighted_n_right = self.weighted_n_node_samples
+        self.pos = self.start
+
+        # reset the medianheaps, left should have no elements and
+        # right should have all elements.
+
+        for k in range(self.n_outputs):
+            # if left has no elements, it's already reset
+            for i in range((<MedianHeap> left_child_heaps[k]).size()):
+                # remove everything from left and put it into right
+                (<MedianHeap> left_child_heaps[k]).pop(&popped)
+                (<MedianHeap> right_child_heaps[k]).push(popped)
+
+    cdef void reverse_reset(self) nogil:
+        """Reset the criterion at pos=end."""
+
+        self.weighted_n_right = 0.0
+        self.weighted_n_left = self.weighted_n_node_samples
+        self.pos = self.end
+
+        cdef DOUBLE_t popped
+        cdef void** left_child_heaps = <void**> self.left_child_heaps.data
+        cdef void** right_child_heaps = <void**> self.right_child_heaps.data
+
+        # reverse_reset the medianheaps, right should have no elements and
+        # left should have all elements.
+        for k in range(self.n_outputs):
+            # if right has no elements, it's already reset
+            for i in range((<MedianHeap> right_child_heaps[k]).size()):
+                # remove everything from right and put it into left
+                (<MedianHeap> right_child_heaps[k]).pop(&popped)
+                (<MedianHeap> left_child_heaps[k]).push(popped)
+
+    cdef void update(self, SIZE_t new_pos) nogil:
+        """Updated statistics by moving samples[pos:new_pos] to the left."""
+
+        cdef double* sample_weight = self.sample_weight
+        cdef SIZE_t* samples = self.samples
+
+        cdef void** left_child_heaps = <void**> self.left_child_heaps.data
+        cdef void** right_child_heaps = <void**> self.right_child_heaps.data
+
+        cdef DOUBLE_t* y = self.y
+        cdef SIZE_t pos = self.pos
+        cdef SIZE_t end = self.end
+        cdef SIZE_t i
+        cdef SIZE_t p
+        cdef SIZE_t k
+        cdef DOUBLE_t w = 1.0
+        cdef DOUBLE_t y_ik
+        cdef DOUBLE_t w_y_ik
+
+        # Update statistics up to new_pos
+        #
+        # We are going to update right_child_heaps and left_child_heaps
+        # from the direction that requires the least amount of
+        # computation, i.e. from pos to new_pos or from end to new_pos.
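The comment above is the heart of update(): the split point can be advanced from either boundary, and the criterion walks from whichever side touches fewer samples. A hedged one-function sketch of that rule (illustrative name, not part of the patch):

    def walk_forward(pos, new_pos, end):
        # True: move samples[pos:new_pos] to the left child, one
        # remove/push pair per sample; False: reverse_reset first, then
        # move samples[new_pos:end] to the right child instead.
        return (new_pos - pos) <= (end - new_pos)

Either direction leaves the heaps in the same state; the choice only bounds how many per-sample heap operations each update costs.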
+ + if (new_pos - pos) <= (end - new_pos): + for p in range(pos, new_pos): + i = samples[p] + + if sample_weight != NULL: + w = sample_weight[i] + + for k in range(self.n_outputs): + y_ik = y[i * self.y_stride + k] + w_y_ik = w * y_ik + + # remove w_y_ik from right and add to left + ( right_child_heaps[k]).remove(w_y_ik) + ( left_child_heaps[k]).push(w_y_ik) + + self.weighted_n_left += w + else: + self.reverse_reset() + + for p in range(end - 1, new_pos - 1, -1): + i = samples[p] + + if sample_weight != NULL: + w = sample_weight[i] + + for k in range(self.n_outputs): + y_ik = y[i * self.y_stride + k] + w_y_ik = w * y_ik + + # remove w_y_ik from left and add to right + ( left_child_heaps[k]).remove(w_y_ik) + ( right_child_heaps[k]).push(w_y_ik) + + self.weighted_n_left -= w + + self.weighted_n_right = (self.weighted_n_node_samples - + self.weighted_n_left) + self.pos = new_pos + cdef void node_value(self, double* dest) nogil: """Computes the node value of samples[start:end] into dest.""" - cdef SIZE_t start = self.start - cdef SIZE_t end = self.end - compute_weighted_median(dest, self.start, self.end, - self.coupled_sorted_weights, - self.coupled_sorted_y, - self.n_outputs) + + cdef SIZE_t k + for k in range(self.n_outputs): + dest[k] = self.node_medians[k] cdef double node_impurity(self) nogil: """Evaluate the impurity of the current node, i.e. the impurity of samples[start:end]""" - cdef double* medians = NULL - medians = calloc(self.n_outputs, sizeof(double)) - if medians == NULL: - with gil: - raise MemoryError() - cdef double impurity = 0.0 + + cdef DOUBLE_t* y = self.y + cdef DOUBLE_t* sample_weight = self.sample_weight cdef SIZE_t* samples = self.samples cdef SIZE_t i, p, k + cdef DOUBLE_t w = 1.0 cdef DOUBLE_t y_ik - self.node_value(medians) + cdef DOUBLE_t w_y_ik + + cdef double impurity = 0.0 + for k in range(self.n_outputs): for p in range(self.start, self.end): - y_ik = self.coupled_sorted_y[p] - impurity += fabs(( y_ik) - medians[k]) - free(medians) + i = samples[p] + + if sample_weight != NULL: + w = sample_weight[i] + + y_ik = y[i * self.y_stride + k] + w_y_ik = w * y_ik + + impurity += fabs(( w_y_ik) - self.node_medians[k]) return impurity / (self.weighted_n_node_samples * self.n_outputs) cdef void children_impurity(self, double* impurity_left, @@ -1036,7 +1216,7 @@ cdef class MAE(RegressionCriterion): left child (samples[start:pos]) and the impurity the right child (samples[pos:end]). 
""" - cdef DOUBLE_t* y = self.coupled_sorted_y + cdef DOUBLE_t* y = self.y cdef DOUBLE_t* sample_weight = self.sample_weight cdef SIZE_t* samples = self.samples @@ -1046,35 +1226,41 @@ cdef class MAE(RegressionCriterion): cdef SIZE_t i, p, k cdef DOUBLE_t y_ik + cdef DOUBLE_t w = 1.0 + cdef DOUBLE_t w_y_ik + cdef DOUBLE_t median - cdef double* medians = NULL - medians = calloc(self.n_outputs, sizeof(double)) - if (medians == NULL): - with gil: - raise MemoryError() + cdef void** left_child_heaps = self.left_child_heaps.data + cdef void** right_child_heaps = self.right_child_heaps.data - compute_weighted_median(medians, start, pos, - self.coupled_sorted_weights, - self.coupled_sorted_y, - self.n_outputs) impurity_left[0] = 0.0 impurity_right[0] = 0.0 for k in range(self.n_outputs): + ( left_child_heaps[k]).get_median(&median) for p in range(start, pos): - y_ik = y[p] - impurity_left[0] += fabs((y_ik) - medians[k]) + i = samples[p] + + if sample_weight != NULL: + w = sample_weight[i] + + y_ik = y[i * self.y_stride + k] + w_y_ik = w * y_ik + impurity_left[0] += fabs(( w_y_ik) - median) impurity_left[0] /= ((pos - start) * self.n_outputs) - compute_weighted_median(medians, pos, end, - self.coupled_sorted_weights, - self.coupled_sorted_y, - self.n_outputs) + for k in range(self.n_outputs): + ( right_child_heaps[k]).get_median(&median) for p in range(pos, end): - y_ik = y[p] - impurity_right[0] += fabs((y_ik) - medians[k]) + i = samples[p] + + if sample_weight != NULL: + w = sample_weight[i] + + y_ik = y[i * self.y_stride + k] + w_y_ik = w * y_ik + impurity_right[0] += fabs(( w_y_ik) - median) impurity_right[0] /= ((end - pos) * self.n_outputs) - free(medians) cdef class FriedmanMSE(MSE): diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd index dbea4a9c60233..90482c4e11c01 100644 --- a/sklearn/tree/_utils.pxd +++ b/sklearn/tree/_utils.pxd @@ -39,14 +39,6 @@ cdef realloc_ptr safe_realloc(realloc_ptr* p, size_t nelems) except * cdef np.ndarray sizet_ptr_to_ndarray(SIZE_t* data, SIZE_t size) -cdef void compute_weighted_median(double* median_dest, SIZE_t start, SIZE_t end, - DOUBLE_t* coupled_sorted_weights, - DOUBLE_t* coupled_sorted_y, - SIZE_t n_outputs) nogil - -cdef void sort_values_and_weights(DOUBLE_t* y_vals, DOUBLE_t* weights, - SIZE_t low, SIZE_t high) nogil - cdef SIZE_t rand_int(SIZE_t low, SIZE_t high, UINT32_t* random_state) nogil @@ -111,3 +103,47 @@ cdef class PriorityHeap: double impurity, double impurity_left, double impurity_right) nogil cdef int pop(self, PriorityHeapRecord* res) nogil + cdef void heapify_up(self, PriorityHeapRecord* heap, SIZE_t pos) nogil + cdef void heapify_down(self, PriorityHeapRecord* heap, SIZE_t pos, + SIZE_t heap_length) nogil + +# ============================================================================= +# MinMaxHeap data structure +# ============================================================================= + +# A record stored in the MinMaxHeap +cdef struct MinMaxHeapRecord: + DOUBLE_t data + +cdef class MinMaxHeap: + cdef SIZE_t capacity + cdef SIZE_t heap_ptr + cdef MinMaxHeapRecord* heap_ + cdef bint mode + + cdef void heapify_up(self, MinMaxHeapRecord* heap, SIZE_t pos) nogil + cdef void heapify_down(self, MinMaxHeapRecord* heap, SIZE_t pos, + SIZE_t heap_length) nogil + cdef bint is_empty(self) nogil + cdef SIZE_t size(self) nogil + cdef int push(self, DOUBLE_t data) nogil + cdef int remove(self, DOUBLE_t value) nogil + cdef int pop(self, DOUBLE_t* res) nogil + cdef int peek(self, DOUBLE_t* res) nogil + +# 
============================================================================= +# MedianHeap data structure +# ============================================================================= + +cdef class MedianHeap: + cdef SIZE_t initial_capacity + cdef SIZE_t current_capacity + cdef MinMaxHeap right_min_heap + cdef MinMaxHeap left_max_heap + + cdef SIZE_t size(self) nogil + cdef int pop(self, DOUBLE_t* res) nogil + cdef int push(self, DOUBLE_t data) nogil + cdef int remove(self, DOUBLE_t data) nogil + cdef int get_median(self, DOUBLE_t* data) nogil + cdef int rebalance(self) nogil diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index 616ad1c1652b7..b1a01a289d5d3 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -6,12 +6,14 @@ # Peter Prettenhofer # Arnaud Joly # Jacob Schreiber +# Nelson Liu # # # License: BSD 3 clause from libc.stdlib cimport free from libc.stdlib cimport malloc +from libc.stdlib cimport calloc from libc.stdlib cimport realloc from libc.math cimport log as ln @@ -49,85 +51,6 @@ def _realloc_test(): assert False -cdef void compute_weighted_median(double* median_dest, SIZE_t start, SIZE_t end, - DOUBLE_t* coupled_sorted_weights, - DOUBLE_t* coupled_sorted_y, - SIZE_t n_outputs) nogil: - """Calculate the weighted median of samples[start:end] and put - it into a destination pointer - given values, weights, and a start and end index. - """ - cdef DOUBLE_t w = 1.0 - cdef SIZE_t p, k - cdef DOUBLE_t sum_weights - cdef SIZE_t median_index - cdef DOUBLE_t running_sum - cdef SIZE_t n_node_samples = end - start - - for k in range(n_outputs): - median_index = 0 - sum_weights = 0.0 - - if coupled_sorted_weights != NULL: - # calculate the weighted median - for p in range(n_node_samples): - sum_weights += coupled_sorted_weights[p] - - running_sum = sum_weights - coupled_sorted_weights[0] - - while(running_sum > sum_weights/2): - median_index += 1 - running_sum -= coupled_sorted_weights[median_index] - - if running_sum == sum_weights/2: - median_dest[k] = (coupled_sorted_y[median_index + start] + - coupled_sorted_y[median_index + 1 + start]) / 2.0 - else: - median_dest[k] = coupled_sorted_y[median_index + start] - else: - # calculate the unweighted median - if (n_node_samples) % 2 == 0: - median_dest[k] = (coupled_sorted_y[(n_node_samples / 2) + start] + - coupled_sorted_y[(n_node_samples / 2) - 1 + start]) / 2.0 - else: - median_dest[k] = coupled_sorted_y[(n_node_samples / 2) + start] - - -cdef void sort_values_and_weights(DOUBLE_t* y_vals, DOUBLE_t* weights, - SIZE_t low, SIZE_t high) nogil: - """Sort an array and its weights""" - cdef SIZE_t pivot, i, j, - cdef DOUBLE_t temp - if low < high: - pivot = low - i = low - j = high - while i < j: - while(y_vals[i] <= y_vals[pivot] and i <= high): - i += 1 - while(y_vals[j] > y_vals[pivot] and j >= low): - j -= 1 - if i < j: - temp = y_vals[i] - y_vals[i] = y_vals[j] - y_vals[j] = temp - if weights != NULL: - temp = weights[i] - weights[i] = weights[j] - weights[j] = temp - temp = y_vals[j] - y_vals[j] = y_vals[pivot] - y_vals[pivot] = temp - - if weights != NULL: - temp = weights[j] - weights[j] = weights[pivot] - weights[pivot] = temp - - sort_values_and_weights(y_vals, weights, low, j-1) - sort_values_and_weights(y_vals, weights, j+1, high) - - # rand_r replacement using a 32bit XorShift generator # See http://www.jstatsoft.org/v08/i14/paper for details cdef inline UINT32_t our_rand_r(UINT32_t* seed) nogil: @@ -250,40 +173,6 @@ cdef class Stack: # PriorityHeap data structure # 
============================================================================= -cdef void heapify_up(PriorityHeapRecord* heap, SIZE_t pos) nogil: - """Restore heap invariant parent.improvement > child.improvement from - ``pos`` upwards. """ - if pos == 0: - return - - cdef SIZE_t parent_pos = (pos - 1) / 2 - - if heap[parent_pos].improvement < heap[pos].improvement: - heap[parent_pos], heap[pos] = heap[pos], heap[parent_pos] - heapify_up(heap, parent_pos) - - -cdef void heapify_down(PriorityHeapRecord* heap, SIZE_t pos, - SIZE_t heap_length) nogil: - """Restore heap invariant parent.improvement > children.improvement from - ``pos`` downwards. """ - cdef SIZE_t left_pos = 2 * (pos + 1) - 1 - cdef SIZE_t right_pos = 2 * (pos + 1) - cdef SIZE_t largest = pos - - if (left_pos < heap_length and - heap[left_pos].improvement > heap[largest].improvement): - largest = left_pos - - if (right_pos < heap_length and - heap[right_pos].improvement > heap[largest].improvement): - largest = right_pos - - if largest != pos: - heap[pos], heap[largest] = heap[largest], heap[pos] - heapify_down(heap, largest, heap_length) - - cdef class PriorityHeap: """A priority queue implemented as a binary heap. @@ -314,6 +203,38 @@ cdef class PriorityHeap: def __dealloc__(self): free(self.heap_) + cdef void heapify_up(self, PriorityHeapRecord* heap, SIZE_t pos) nogil: + """Restore heap invariant parent.improvement > child.improvement from + ``pos`` upwards. """ + if pos == 0: + return + + cdef SIZE_t parent_pos = (pos - 1) / 2 + + if heap[parent_pos].improvement < heap[pos].improvement: + heap[parent_pos], heap[pos] = heap[pos], heap[parent_pos] + self.heapify_up(heap, parent_pos) + + cdef void heapify_down(self, PriorityHeapRecord* heap, SIZE_t pos, + SIZE_t heap_length) nogil: + """Restore heap invariant parent.improvement > children.improvement from + ``pos`` downwards. """ + cdef SIZE_t left_pos = 2 * (pos + 1) - 1 + cdef SIZE_t right_pos = 2 * (pos + 1) + cdef SIZE_t largest = pos + + if (left_pos < heap_length and + heap[left_pos].improvement > heap[largest].improvement): + largest = left_pos + + if (right_pos < heap_length and + heap[right_pos].improvement > heap[largest].improvement): + largest = right_pos + + if largest != pos: + heap[pos], heap[largest] = heap[largest], heap[pos] + self.heapify_down(heap, largest, heap_length) + cdef bint is_empty(self) nogil: return self.heap_ptr <= 0 @@ -353,7 +274,7 @@ cdef class PriorityHeap: heap[heap_ptr].improvement = improvement # Heapify up - heapify_up(heap, heap_ptr) + self.heapify_up(heap, heap_ptr) # Increase element count self.heap_ptr = heap_ptr + 1 @@ -375,8 +296,344 @@ cdef class PriorityHeap: # Restore heap invariant if heap_ptr > 1: - heapify_down(heap, 0, heap_ptr - 1) + self.heapify_down(heap, 0, heap_ptr - 1) + + self.heap_ptr = heap_ptr - 1 + + return 0 + +# ============================================================================= +# MinMaxHeap data structure +# ============================================================================= + +cdef class MinMaxHeap: + """A priority queue implemented as a binary heap. + + The heap invariant is that the impurity improvement of the parent record + is larger then the impurity improvement of the children. + + Attributes + ---------- + capacity : SIZE_t + The capacity of the heap + + heap_ptr : SIZE_t + The water mark of the heap; the heap grows from left to right in the + array ``heap_``. heap_ptr is always less than capacity. + + heap_ : MinMaxHeapRecord* + The array of heap records. 
The maximum element is on the left; + the heap grows from left to right + + mode : bint + The mode of the heap. When the value of the ``mode`` parameter passed + in at construction is ``max``, the heap is a Max-Heap and mode is set + to 1. When the value of the ``mode`` parameter passed in at + construction is not ``max``, the heap is a Min-Heap and mode is set + to 0. + """ + + def __cinit__(self, SIZE_t capacity, str mode): + self.capacity = capacity + if mode == "max": + self.mode = 1 + else: + self.mode = 0 + + self.heap_ptr = 0 + + self.heap_ = calloc(capacity, sizeof(MinMaxHeapRecord)) + if self.heap_ == NULL: + raise MemoryError() + + def __dealloc__(self): + free(self.heap_) + + cdef void heapify_up(self, MinMaxHeapRecord* heap, SIZE_t pos) nogil: + """Restore heap invariant from + ``pos`` upwards. """ + if pos == 0: + return + + cdef SIZE_t parent_pos = (pos - 1) / 2 + + if self.mode == 1: + if heap[parent_pos].data < heap[pos].data: + heap[parent_pos], heap[pos] = heap[pos], heap[parent_pos] + self.heapify_up(heap, parent_pos) + else: + if heap[parent_pos].data > heap[pos].data: + heap[parent_pos], heap[pos] = heap[pos], heap[parent_pos] + self.heapify_up(heap, parent_pos) + + cdef void heapify_down(self, MinMaxHeapRecord* heap, SIZE_t pos, + SIZE_t heap_length) nogil: + """Restore heap invariant from + ``pos`` downwards. """ + cdef SIZE_t left_pos = 2 * (pos + 1) - 1 + cdef SIZE_t right_pos = 2 * (pos + 1) + cdef SIZE_t candidate = pos + + if self.mode == 1: + if (left_pos < heap_length and + heap[left_pos].data > heap[candidate].data): + candidate = left_pos + + if (right_pos < heap_length and + heap[right_pos].data > heap[candidate].data): + candidate = right_pos + else: + if (left_pos < heap_length and + heap[left_pos].data < heap[candidate].data): + candidate = left_pos + + if (right_pos < heap_length and + heap[right_pos].data < heap[candidate].data): + candidate = right_pos + if candidate != pos: + heap[pos], heap[candidate] = heap[candidate], heap[pos] + self.heapify_down(heap, candidate, heap_length) + + cdef bint is_empty(self) nogil: + return self.heap_ptr <= 0 + + cdef SIZE_t size(self) nogil: + return self.heap_ptr + + cdef int push(self, DOUBLE_t data) nogil: + """Push record on the priority heap. + + Returns 0 if successful; -1 on out of memory error. + """ + cdef SIZE_t heap_ptr = self.heap_ptr + cdef MinMaxHeapRecord* heap = NULL + + # Resize if capacity not sufficient + if heap_ptr >= self.capacity: + self.capacity *= 2 + heap = realloc(self.heap_, + self.capacity * + sizeof(MinMaxHeapRecord)) + if heap == NULL: + # no free; __dealloc__ handles that + return -1 + self.heap_ = heap + + # Put element as last element of heap + heap = self.heap_ + heap[heap_ptr].data = data + + # Heapify up + self.heapify_up(heap, heap_ptr) + + # Increase element count + self.heap_ptr = heap_ptr + 1 + return 0 + + cdef int remove(self, DOUBLE_t value) nogil: + """Remove a specific value from heap""" + cdef SIZE_t heap_ptr = self.heap_ptr + cdef MinMaxHeapRecord* heap = self.heap_ + cdef SIZE_t idx_to_remove = -1 + cdef SIZE_t i + + if heap_ptr <= 0: + return -1 + + # find element to remove + for i in range(0, heap_ptr): + if heap[i].data == value: + idx_to_remove = i + break + # should we throw an error if the element isn't found? + # it shouldn't happen, but better to fail noisily...? 
+ + # put the last element where we want to remove + heap[i], heap[heap_ptr - 1] = heap[heap_ptr - 1], heap[i] + + # Restore heap invariant + if heap_ptr > 1: + self.heapify_down(heap, 0, heap_ptr - 1) + + self.heap_ptr = heap_ptr - 1 + + return 0 + + + cdef int pop(self, DOUBLE_t* res) nogil: + """Remove top element from heap.""" + cdef SIZE_t heap_ptr = self.heap_ptr + cdef MinMaxHeapRecord* heap = self.heap_ + + if heap_ptr <= 0: + return -1 + + # Take first element + res[0] = heap[0].data + + # Put last element to the front + heap[0], heap[heap_ptr - 1] = heap[heap_ptr - 1], heap[0] + + # Restore heap invariant + if heap_ptr > 1: + self.heapify_down(heap, 0, heap_ptr - 1) self.heap_ptr = heap_ptr - 1 return 0 + + cdef int peek(self, DOUBLE_t* res) nogil: + """Write the top element from heap to a pointer.""" + cdef SIZE_t heap_ptr = self.heap_ptr + cdef MinMaxHeapRecord* heap = self.heap_ + if heap_ptr <= 0: + return -1 + # Take first value + res[0] = heap[0].data + return 0 + +# ============================================================================= +# MedianHeap data structure +# ============================================================================= + +cdef class MedianHeap: + + def __cinit__(self, SIZE_t initial_capacity): + self.initial_capacity = initial_capacity + self.current_capacity = 0 + self.left_max_heap = MinMaxHeap(initial_capacity, "max") + self.right_min_heap = MinMaxHeap(initial_capacity, "min") + + cdef SIZE_t size(self) nogil: + return self.current_capacity + + cdef int push(self, DOUBLE_t data) nogil: + """Push a value to the MedianHeap to be considered + in the median calculation + """ + cdef double current_median + cdef int return_value + + if self.current_capacity == 0: + return_value = self.left_max_heap.push(data) + else: + self.get_median(¤t_median) + if current_median <= data: + # data is greater than or equal to current median, so it goes on min heap + return_value = self.right_min_heap.push(data) + else: + # data is less than current median, so it goes on max heap + return_value = self.left_max_heap.push(data) + self.rebalance() + self.current_capacity += 1 + return return_value + + cdef int remove(self, DOUBLE_t data) nogil: + """Remove a value from the MedianHeap, removing it + from consideration in the median calculation + """ + cdef double current_median + cdef int return_value + + self.get_median(¤t_median) + if current_median == data: + # data is the same value as current median, it is in + # the bigger one + if self.right_min_heap.size() > self.left_max_heap.size(): + # it is in the right + return_value = self.right_min_heap.remove(data) + else: + # it is in the left + return_value = self.left_max_heap.remove(data) + elif current_median < data: + # data is greater than or equal to current median, so it is on min heap + return_value = self.right_min_heap.remove(data) + else: + # data is less than current median, so it is on max heap + return_value = self.left_max_heap.remove(data) + self.rebalance() + self.current_capacity -= 1 + return return_value + + cdef int pop(self, DOUBLE_t* res) nogil: + """Pop a value from the MedianHeap, starting from the + left and moving to the right. 
+ """ + cdef int return_value + + # no elements to pop + if self.current_capacity == 0: + return -1 + + if self.left_max_heap.size() != 0: + # pop from the left + return_value = self.left_max_heap.pop(res) + elif self.right_min_heap.size() != 0: + # pop from right + return_value = self.right_min_heap.pop(res) + else: + return -1 + self.rebalance() + self.current_capacity -= 1 + return return_value + + cdef int get_median(self, double* data) nogil: + """Return the current median""" + if self.current_capacity == 0: + return -1 + + cdef SIZE_t left_max_heap_size = self.left_max_heap.size() + cdef SIZE_t right_min_heap_size = self.right_min_heap.size() + cdef DOUBLE_t left_max_heap_median + cdef DOUBLE_t right_min_heap_median + + if self.current_capacity < 2: + # there is only one thing, so set the median to be that + if left_max_heap_size >= 1: + self.left_max_heap.peek(&left_max_heap_median) + data[0] = left_max_heap_median + else: + self.right_min_heap.peek(&right_min_heap_median) + data[0] = right_min_heap_median + return 0 + self.left_max_heap.peek(&left_max_heap_median) + self.right_min_heap.peek(&right_min_heap_median) + + if left_max_heap_size == right_min_heap_size: + # take the average of the two + data[0] = (left_max_heap_median + + right_min_heap_median) / 2.0 + elif left_max_heap_size > right_min_heap_size: + # left max heap larger, so median is at its' top + data[0] = left_max_heap_median + else: + # right min heap is larger, so median is at its' top + data[0] = right_min_heap_median + return 0 + + cdef int rebalance(self) nogil: + """Rebalance the left max heap and the left min heap to have a + one element or less difference in size""" + cdef SIZE_t left_max_heap_size = self.left_max_heap.size() + cdef SIZE_t right_min_heap_size = self.right_min_heap.size() + cdef SIZE_t size_difference = left_max_heap_size - right_min_heap_size + cdef DOUBLE_t popped + cdef SIZE_t i + + if size_difference >= -1 and size_difference <= 1: + # no balancing needed + return 0 + + if size_difference > 1: + # left max heap bigger + for i in range(0, size_difference - 1): + # pop from left max heap and push into right min heap + self.left_max_heap.pop(&popped) + self.right_min_heap.push(popped) + else: + # right min heap bigger + for i in range(0, (size_difference * -1) - 1): + # pop from right min heap and push into left max heap + self.right_min_heap.pop(&popped) + self.left_max_heap.push(popped) + return 0 From 39e693ca86ba4f1c222ad605a5238492a9743fa5 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Wed, 22 Jun 2016 08:35:38 -0700 Subject: [PATCH 38/75] style: remove extra line --- sklearn/tree/_criterion.pyx | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 929f6b7af4b0c..e2679fe004b19 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -797,6 +797,7 @@ cdef class RegressionCriterion(Criterion): cdef void update(self, SIZE_t new_pos) nogil: """Updated statistics by moving samples[pos:new_pos] to the left.""" + cdef double* sum_left = self.sum_left cdef double* sum_right = self.sum_right cdef double* sum_total = self.sum_total @@ -890,6 +891,7 @@ cdef class MSE(RegressionCriterion): impurity = self.sq_sum_total / self.weighted_n_node_samples for k in range(self.n_outputs): impurity -= (sum_total[k] / self.weighted_n_node_samples)**2.0 + return impurity / self.n_outputs cdef double proxy_impurity_improvement(self) nogil: @@ -914,6 +916,7 @@ cdef class MSE(RegressionCriterion): for k in 
range(self.n_outputs): proxy_impurity_left += sum_left[k] * sum_left[k] proxy_impurity_right += sum_right[k] * sum_right[k] + return (proxy_impurity_left / self.weighted_n_left + proxy_impurity_right / self.weighted_n_right) @@ -923,7 +926,6 @@ cdef class MSE(RegressionCriterion): left child (samples[start:pos]) and the impurity the right child (samples[pos:end]).""" - cdef DOUBLE_t* y = self.y cdef DOUBLE_t* sample_weight = self.sample_weight cdef SIZE_t* samples = self.samples From 20d6107c7ebfba1b96d483cfacb97a9ac304b3cb Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Thu, 23 Jun 2016 12:13:03 +0800 Subject: [PATCH 39/75] style: fix inadvertent cosmetic changes; i'll address some of these in a separate PR --- sklearn/tree/_criterion.pyx | 7 +++++-- sklearn/tree/_utils.pxd | 1 + 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index e2679fe004b19..3278eb0a2bec7 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -685,6 +685,7 @@ cdef class RegressionCriterion(Criterion): var = \sum_i^n (y_i - y_bar) ** 2 = (\sum_i^n y_i ** 2) - n_samples * y_bar ** 2 """ + cdef double sq_sum_total def __cinit__(self, SIZE_t n_outputs): @@ -696,6 +697,7 @@ cdef class RegressionCriterion(Criterion): The number of targets to be predicted """ + # Default values self.y = NULL self.y_stride = 0 self.sample_weight = NULL @@ -771,12 +773,13 @@ cdef class RegressionCriterion(Criterion): self.sq_sum_total += w_y_ik * y_ik self.weighted_n_node_samples += w + + # Reset to pos=start self.reset() cdef void reset(self) nogil: """Reset the criterion at pos=start.""" cdef SIZE_t n_bytes = self.n_outputs * sizeof(double) - memset(self.sum_left, 0, n_bytes) memcpy(self.sum_right, self.sum_total, n_bytes) @@ -784,7 +787,6 @@ cdef class RegressionCriterion(Criterion): self.weighted_n_right = self.weighted_n_node_samples self.pos = self.start - cdef void reverse_reset(self) nogil: """Reset the criterion at pos=end.""" cdef SIZE_t n_bytes = self.n_outputs * sizeof(double) @@ -926,6 +928,7 @@ cdef class MSE(RegressionCriterion): left child (samples[start:pos]) and the impurity the right child (samples[pos:end]).""" + cdef DOUBLE_t* y = self.y cdef DOUBLE_t* sample_weight = self.sample_weight cdef SIZE_t* samples = self.samples diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd index 90482c4e11c01..ada1620e9820b 100644 --- a/sklearn/tree/_utils.pxd +++ b/sklearn/tree/_utils.pxd @@ -39,6 +39,7 @@ cdef realloc_ptr safe_realloc(realloc_ptr* p, size_t nelems) except * cdef np.ndarray sizet_ptr_to_ndarray(SIZE_t* data, SIZE_t size) + cdef SIZE_t rand_int(SIZE_t low, SIZE_t high, UINT32_t* random_state) nogil From f73ac8e487301a6138d8013bf4ea185aa5cb92ae Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Fri, 8 Jul 2016 10:15:08 -0700 Subject: [PATCH 40/75] feature: change minmaxheap to internally use sorted arrays --- sklearn/tree/_utils.pxd | 3 -- sklearn/tree/_utils.pyx | 91 ++++++++++++----------------------------- 2 files changed, 26 insertions(+), 68 deletions(-) diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd index ada1620e9820b..f688e8340c605 100644 --- a/sklearn/tree/_utils.pxd +++ b/sklearn/tree/_utils.pxd @@ -122,9 +122,6 @@ cdef class MinMaxHeap: cdef MinMaxHeapRecord* heap_ cdef bint mode - cdef void heapify_up(self, MinMaxHeapRecord* heap, SIZE_t pos) nogil - cdef void heapify_down(self, MinMaxHeapRecord* heap, SIZE_t pos, - SIZE_t heap_length) nogil cdef bint is_empty(self) nogil cdef SIZE_t 
size(self) nogil cdef int push(self, DOUBLE_t data) nogil diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index b1a01a289d5d3..d54815279ce99 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -309,8 +309,10 @@ cdef class PriorityHeap: cdef class MinMaxHeap: """A priority queue implemented as a binary heap. - The heap invariant is that the impurity improvement of the parent record - is larger then the impurity improvement of the children. + The heap invariant is that the impurity improvement of the parent record is + larger then the impurity improvement of the children. The MinHeap is + essentially an array sorted in ascending order, and a MaxHeap is an array + sorted in descending order. Attributes ---------- @@ -349,51 +351,6 @@ cdef class MinMaxHeap: def __dealloc__(self): free(self.heap_) - cdef void heapify_up(self, MinMaxHeapRecord* heap, SIZE_t pos) nogil: - """Restore heap invariant from - ``pos`` upwards. """ - if pos == 0: - return - - cdef SIZE_t parent_pos = (pos - 1) / 2 - - if self.mode == 1: - if heap[parent_pos].data < heap[pos].data: - heap[parent_pos], heap[pos] = heap[pos], heap[parent_pos] - self.heapify_up(heap, parent_pos) - else: - if heap[parent_pos].data > heap[pos].data: - heap[parent_pos], heap[pos] = heap[pos], heap[parent_pos] - self.heapify_up(heap, parent_pos) - - cdef void heapify_down(self, MinMaxHeapRecord* heap, SIZE_t pos, - SIZE_t heap_length) nogil: - """Restore heap invariant from - ``pos`` downwards. """ - cdef SIZE_t left_pos = 2 * (pos + 1) - 1 - cdef SIZE_t right_pos = 2 * (pos + 1) - cdef SIZE_t candidate = pos - - if self.mode == 1: - if (left_pos < heap_length and - heap[left_pos].data > heap[candidate].data): - candidate = left_pos - - if (right_pos < heap_length and - heap[right_pos].data > heap[candidate].data): - candidate = right_pos - else: - if (left_pos < heap_length and - heap[left_pos].data < heap[candidate].data): - candidate = left_pos - - if (right_pos < heap_length and - heap[right_pos].data < heap[candidate].data): - candidate = right_pos - if candidate != pos: - heap[pos], heap[candidate] = heap[candidate], heap[pos] - self.heapify_down(heap, candidate, heap_length) - cdef bint is_empty(self) nogil: return self.heap_ptr <= 0 @@ -406,6 +363,7 @@ cdef class MinMaxHeap: Returns 0 if successful; -1 on out of memory error. """ cdef SIZE_t heap_ptr = self.heap_ptr + cdef SIZE_t i cdef MinMaxHeapRecord* heap = NULL # Resize if capacity not sufficient @@ -423,8 +381,19 @@ cdef class MinMaxHeap: heap = self.heap_ heap[heap_ptr].data = data - # Heapify up - self.heapify_up(heap, heap_ptr) + # bubble last element up according to mode + # max heap, sorted in descending order + i = heap_ptr + if self.mode == 1: + while(i != 0 and heap[i].data > heap[i-1].data): + heap[i], heap[i-1] = heap[i-1], heap[i] + i = i-1 + + # min heap, sorted in ascending order + else: + while(i != 0 and heap[i].data < heap[i-1].data): + heap[i], heap[i-1] = heap[i-1], heap[i] + i = i-1 # Increase element count self.heap_ptr = heap_ptr + 1 @@ -441,29 +410,25 @@ cdef class MinMaxHeap: return -1 # find element to remove - for i in range(0, heap_ptr): + for i in range(heap_ptr): if heap[i].data == value: idx_to_remove = i break # should we throw an error if the element isn't found? # it shouldn't happen, but better to fail noisily...? 
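At this point the heapify routines are gone entirely: MinMaxHeap now keeps its records in sorted order, so push bubbles the new record into place and remove and pop shift the tail over by one. A rough pure-Python analogue of that behaviour using the standard bisect module (the class and its names are illustrative, not the patch's API):

    import bisect

    class SortedArrayQueue:
        def __init__(self):
            self.data = []                   # ascending order, like the "min" mode

        def push(self, x):
            bisect.insort(self.data, x)      # O(n) shift, same cost as the bubble-up

        def remove(self, x):
            i = bisect.bisect_left(self.data, x)
            if i == len(self.data) or self.data[i] != x:
                raise ValueError("value not in queue")
            del self.data[i]                 # shifts the tail left by one

        def peek(self):
            return self.data[0]              # a "max" mode would keep the reverse order

Linear-time insertion looks like a step backwards from a true heap, but a sorted array gives constant-time access to any rank, which the weighted-median bookkeeping introduced in patch 41 (see get_index_data there) appears to rely on.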
- # put the last element where we want to remove - heap[i], heap[heap_ptr - 1] = heap[heap_ptr - 1], heap[i] - - # Restore heap invariant - if heap_ptr > 1: - self.heapify_down(heap, 0, heap_ptr - 1) + # move after the removed element over by one + for i in range(idx_to_remove, heap_ptr-1): + heap[i] = heap[i+1] self.heap_ptr = heap_ptr - 1 - return 0 - cdef int pop(self, DOUBLE_t* res) nogil: """Remove top element from heap.""" cdef SIZE_t heap_ptr = self.heap_ptr cdef MinMaxHeapRecord* heap = self.heap_ + cdef SIZE_t i if heap_ptr <= 0: return -1 @@ -471,15 +436,11 @@ cdef class MinMaxHeap: # Take first element res[0] = heap[0].data - # Put last element to the front - heap[0], heap[heap_ptr - 1] = heap[heap_ptr - 1], heap[0] - - # Restore heap invariant - if heap_ptr > 1: - self.heapify_down(heap, 0, heap_ptr - 1) + # move after the removed element over by one + for i in range(0, heap_ptr-1): + heap[i] = heap[i+1] self.heap_ptr = heap_ptr - 1 - return 0 cdef int peek(self, DOUBLE_t* res) nogil: From 5b8d665c2968c54cd1c07c237ddf0a1438b4cd4c Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Sat, 16 Jul 2016 13:02:39 -0500 Subject: [PATCH 41/75] refactored MAE and push to share work --- sklearn/tree/_criterion.pyx | 84 +++----- sklearn/tree/_utils.pxd | 45 ++-- sklearn/tree/_utils.pyx | 406 ++++++++++++++++++++---------------- 3 files changed, 281 insertions(+), 254 deletions(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 3278eb0a2bec7..15372f686852a 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -29,8 +29,7 @@ np.import_array() from ._utils cimport log from ._utils cimport safe_realloc from ._utils cimport sizet_ptr_to_ndarray -from ._utils cimport MedianHeap -from ._utils cimport MinMaxHeapRecord +from ._utils cimport WeightedMedianHeap cdef class Criterion: """Interface for impurity criteria. 
@@ -964,7 +963,7 @@ cdef class MSE(RegressionCriterion): for k in range(self.n_outputs): impurity_left[0] -= (sum_left[k] / self.weighted_n_left) ** 2.0 - impurity_right[0] -= (sum_right[k] / self.weighted_n_right) ** 2.0 + impurity_right[0] -= (sum_right[k] / self.weighted_n_right) ** 2.0 impurity_left[0] /= self.n_outputs impurity_right[0] /= self.n_outputs @@ -973,7 +972,6 @@ cdef class MAE(RegressionCriterion): """Mean absolute error impurity criterion""" def __dealloc__(self): """Destructor.""" - free(self.node_medians) cdef np.ndarray left_child_heaps @@ -1039,14 +1037,13 @@ cdef class MAE(RegressionCriterion): cdef SIZE_t p cdef SIZE_t k cdef DOUBLE_t y_ik - cdef DOUBLE_t w_y_ik cdef DOUBLE_t w = 1.0 # Fill accumulators with MedianHeaps with gil: for k in range(self.n_outputs): - self.left_child_heaps[k] = MedianHeap(self.n_node_samples) - self.right_child_heaps[k] = MedianHeap(self.n_node_samples) + self.left_child_heaps[k] = WeightedMedianHeap(self.n_node_samples) + self.right_child_heaps[k] = WeightedMedianHeap(self.n_node_samples) cdef void** left_child_heaps = self.left_child_heaps.data cdef void** right_child_heaps = self.right_child_heaps.data @@ -1059,16 +1056,16 @@ cdef class MAE(RegressionCriterion): for k in range(self.n_outputs): y_ik = y[i * y_stride + k] - w_y_ik = w * y_ik # push all values to the right side, # since pos = start initially anyway - ( right_child_heaps[k]).push(w_y_ik) + ( right_child_heaps[k]).push(y_ik, w) self.weighted_n_node_samples += w + # calculate the node medians for k in range(self.n_outputs): - ( right_child_heaps[k]).get_median(&(self.node_medians[k])) + ( right_child_heaps[k]).get_median(&(self.node_medians[k])) # Reset to pos=start self.reset() @@ -1078,7 +1075,8 @@ cdef class MAE(RegressionCriterion): cdef SIZE_t i cdef SIZE_t k - cdef DOUBLE_t popped + cdef DOUBLE_t popped_value + cdef DOUBLE_t popped_weight cdef void** left_child_heaps = self.left_child_heaps.data cdef void** right_child_heaps = self.right_child_heaps.data @@ -1092,10 +1090,12 @@ cdef class MAE(RegressionCriterion): for k in range(self.n_outputs): # if left has no elements, it's already reset - for i in range(( left_child_heaps[k]).size()): + for i in range(( left_child_heaps[k]).size()): # remove everything from left and put it into right - ( left_child_heaps[k]).pop(&popped) - ( right_child_heaps[k]).push(popped) + ( left_child_heaps[k]).pop(&popped_value, + &popped_weight) + ( right_child_heaps[k]).push(popped_value, + popped_weight) cdef void reverse_reset(self) nogil: """Reset the criterion at pos=end.""" @@ -1104,7 +1104,8 @@ cdef class MAE(RegressionCriterion): self.weighted_n_left = self.weighted_n_node_samples self.pos = self.end - cdef DOUBLE_t popped + cdef DOUBLE_t popped_value + cdef DOUBLE_t popped_weight cdef void** left_child_heaps = self.left_child_heaps.data cdef void** right_child_heaps = self.right_child_heaps.data @@ -1112,10 +1113,12 @@ cdef class MAE(RegressionCriterion): # left should have all elements. 
for k in range(self.n_outputs): # if right has no elements, it's already reset - for i in range(( right_child_heaps[k]).size()): + for i in range(( right_child_heaps[k]).size()): # remove everything from right and put it into left - ( right_child_heaps[k]).pop(&popped) - ( left_child_heaps[k]).push(popped) + ( right_child_heaps[k]).pop(&popped_value, + &popped_weight) + ( left_child_heaps[k]).push(popped_value, + popped_weight) cdef void update(self, SIZE_t new_pos) nogil: """Updated statistics by moving samples[pos:new_pos] to the left.""" @@ -1134,7 +1137,6 @@ cdef class MAE(RegressionCriterion): cdef SIZE_t k cdef DOUBLE_t w = 1.0 cdef DOUBLE_t y_ik - cdef DOUBLE_t w_y_ik # Update statistics up to new_pos # @@ -1151,11 +1153,9 @@ cdef class MAE(RegressionCriterion): for k in range(self.n_outputs): y_ik = y[i * self.y_stride + k] - w_y_ik = w * y_ik - - # remove w_y_ik from right and add to left - ( right_child_heaps[k]).remove(w_y_ik) - ( left_child_heaps[k]).push(w_y_ik) + # remove y_ik with weight w from right and add to left + ( right_child_heaps[k]).remove(y_ik, w) + ( left_child_heaps[k]).push(y_ik, w) self.weighted_n_left += w else: @@ -1169,15 +1169,13 @@ cdef class MAE(RegressionCriterion): for k in range(self.n_outputs): y_ik = y[i * self.y_stride + k] - w_y_ik = w * y_ik - - # remove w_y_ik from left and add to right - ( left_child_heaps[k]).remove(w_y_ik) - ( right_child_heaps[k]).push(w_y_ik) + # remove y_ik from left and add to right + ( left_child_heaps[k]).remove(y_ik, w) + ( right_child_heaps[k]).push(y_ik, w) self.weighted_n_left -= w - self.weighted_n_right = (self.weighted_n_node_samples - + self.weighted_n_right = (self.weighted_n_node_samples - self.weighted_n_left) self.pos = new_pos @@ -1196,7 +1194,6 @@ cdef class MAE(RegressionCriterion): cdef DOUBLE_t* sample_weight = self.sample_weight cdef SIZE_t* samples = self.samples cdef SIZE_t i, p, k - cdef DOUBLE_t w = 1.0 cdef DOUBLE_t y_ik cdef DOUBLE_t w_y_ik @@ -1206,13 +1203,9 @@ cdef class MAE(RegressionCriterion): for p in range(self.start, self.end): i = samples[p] - if sample_weight != NULL: - w = sample_weight[i] - y_ik = y[i * self.y_stride + k] - w_y_ik = w * y_ik - impurity += fabs(( w_y_ik) - self.node_medians[k]) + impurity += fabs(( y_ik) - self.node_medians[k]) return impurity / (self.weighted_n_node_samples * self.n_outputs) cdef void children_impurity(self, double* impurity_left, @@ -1231,8 +1224,6 @@ cdef class MAE(RegressionCriterion): cdef SIZE_t i, p, k cdef DOUBLE_t y_ik - cdef DOUBLE_t w = 1.0 - cdef DOUBLE_t w_y_ik cdef DOUBLE_t median cdef void** left_child_heaps = self.left_child_heaps.data @@ -1242,29 +1233,22 @@ cdef class MAE(RegressionCriterion): impurity_right[0] = 0.0 for k in range(self.n_outputs): - ( left_child_heaps[k]).get_median(&median) + ( left_child_heaps[k]).get_median(&median) for p in range(start, pos): i = samples[p] - if sample_weight != NULL: - w = sample_weight[i] - y_ik = y[i * self.y_stride + k] - w_y_ik = w * y_ik - impurity_left[0] += fabs(( w_y_ik) - median) + + impurity_left[0] += fabs(( y_ik) - median) impurity_left[0] /= ((pos - start) * self.n_outputs) for k in range(self.n_outputs): - ( right_child_heaps[k]).get_median(&median) + ( right_child_heaps[k]).get_median(&median) for p in range(pos, end): i = samples[p] - if sample_weight != NULL: - w = sample_weight[i] - y_ik = y[i * self.y_stride + k] - w_y_ik = w * y_ik - impurity_right[0] += fabs(( w_y_ik) - median) + impurity_right[0] += fabs(( y_ik) - median) impurity_right[0] /= ((end - pos) * self.n_outputs) 
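With this patch the queues store value/weight pairs and the split statistic becomes a true weighted median. For reference, a from-scratch definition of the quantity WeightedMedianHeap maintains incrementally; the function below is an illustrative sketch, not the patch's API, and assumes non-empty inputs with positive weights:

    def weighted_median(values, weights):
        # smallest value whose cumulative weight reaches half the total weight
        pairs = sorted(zip(values, weights))
        total = float(sum(weights))
        cumulative = 0.0
        for value, weight in pairs:
            cumulative += weight
            if cumulative >= total / 2.0:
                return value

Some definitions average this value with the next one when the cumulative weight lands exactly on total / 2; tracking that boundary case incrementally is the job of the update_median_parameters_post_push and update_median_parameters_post_remove methods declared in the _utils.pxd diff that follows.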
diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd index f688e8340c605..9d5b8a725b371 100644 --- a/sklearn/tree/_utils.pxd +++ b/sklearn/tree/_utils.pxd @@ -109,39 +109,46 @@ cdef class PriorityHeap: SIZE_t heap_length) nogil # ============================================================================= -# MinMaxHeap data structure +# WeightedPQueue data structure # ============================================================================= -# A record stored in the MinMaxHeap -cdef struct MinMaxHeapRecord: +# A record stored in the WeightedPQueue +cdef struct WeightedPQueueRecord: DOUBLE_t data + DOUBLE_t weight -cdef class MinMaxHeap: +cdef class WeightedPQueue: cdef SIZE_t capacity - cdef SIZE_t heap_ptr - cdef MinMaxHeapRecord* heap_ - cdef bint mode + cdef SIZE_t array_ptr + cdef WeightedPQueueRecord* array_ cdef bint is_empty(self) nogil cdef SIZE_t size(self) nogil - cdef int push(self, DOUBLE_t data) nogil - cdef int remove(self, DOUBLE_t value) nogil - cdef int pop(self, DOUBLE_t* res) nogil - cdef int peek(self, DOUBLE_t* res) nogil + cdef int push(self, DOUBLE_t data, DOUBLE_t weight) nogil + cdef int remove(self, DOUBLE_t value, DOUBLE_t weight) nogil + cdef int pop(self, DOUBLE_t* data, DOUBLE_t* weight) nogil + cdef int peek(self, DOUBLE_t* res, DOUBLE_t* weight) nogil + cdef int get_index_data(self, SIZE_t idx, DOUBLE_t* value, DOUBLE_t* weight) nogil # ============================================================================= # MedianHeap data structure # ============================================================================= -cdef class MedianHeap: +cdef class WeightedMedianHeap: cdef SIZE_t initial_capacity cdef SIZE_t current_capacity - cdef MinMaxHeap right_min_heap - cdef MinMaxHeap left_max_heap + cdef WeightedPQueue samples + cdef DOUBLE_t total_weight + cdef SIZE_t k + cdef DOUBLE_t sum_w_0_k # represents sum(weights[0:k]) + # = w[0] + w[1] + ... + w[k-1] cdef SIZE_t size(self) nogil - cdef int pop(self, DOUBLE_t* res) nogil - cdef int push(self, DOUBLE_t data) nogil - cdef int remove(self, DOUBLE_t data) nogil - cdef int get_median(self, DOUBLE_t* data) nogil - cdef int rebalance(self) nogil + cdef int push(self, DOUBLE_t data, DOUBLE_t weight) nogil + cdef int update_median_parameters_post_push(self, DOUBLE_t data, DOUBLE_t weight) nogil + cdef DOUBLE_t get_weight_from_index(self, SIZE_t index) nogil + cdef DOUBLE_t get_value_from_index(self, SIZE_t index) nogil + cdef int remove(self, DOUBLE_t data, DOUBLE_t weight) nogil + cdef int pop(self, DOUBLE_t* data, DOUBLE_t* weight) nogil + cdef int update_median_parameters_post_remove(self, DOUBLE_t data, DOUBLE_t weight) nogil + cdef int get_median(self, double* median) nogil diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index d54815279ce99..17e9361fc8f6c 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -303,220 +303,263 @@ cdef class PriorityHeap: return 0 # ============================================================================= -# MinMaxHeap data structure +# WeightedPQueue data structure # ============================================================================= -cdef class MinMaxHeap: - """A priority queue implemented as a binary heap. - - The heap invariant is that the impurity improvement of the parent record is - larger then the impurity improvement of the children. The MinHeap is - essentially an array sorted in ascending order, and a MaxHeap is an array - sorted in descending order. 
+cdef class WeightedPQueue: + """A priority queue class, always sorted in increasing order. Attributes ---------- capacity : SIZE_t - The capacity of the heap + The capacity of the array - heap_ptr : SIZE_t - The water mark of the heap; the heap grows from left to right in the - array ``heap_``. heap_ptr is always less than capacity. + array_ptr : SIZE_t + The water mark of the array; the array grows from left to right in the + array ``array_``. array_ptr is always less than capacity. - heap_ : MinMaxHeapRecord* - The array of heap records. The maximum element is on the left; - the heap grows from left to right - - mode : bint - The mode of the heap. When the value of the ``mode`` parameter passed - in at construction is ``max``, the heap is a Max-Heap and mode is set - to 1. When the value of the ``mode`` parameter passed in at - construction is not ``max``, the heap is a Min-Heap and mode is set - to 0. + array_ : WeightedPQueueRecord* + The array of array records. The minimum element is on the left; + the array grows from left to right """ - def __cinit__(self, SIZE_t capacity, str mode): + def __cinit__(self, SIZE_t capacity): self.capacity = capacity - if mode == "max": - self.mode = 1 - else: - self.mode = 0 + self.array_ptr = 0 - self.heap_ptr = 0 + self.array_ = calloc(capacity, sizeof(WeightedPQueueRecord)) - self.heap_ = calloc(capacity, sizeof(MinMaxHeapRecord)) - if self.heap_ == NULL: + if self.array_ == NULL: raise MemoryError() def __dealloc__(self): - free(self.heap_) + free(self.array_) cdef bint is_empty(self) nogil: - return self.heap_ptr <= 0 + return self.array_ptr <= 0 cdef SIZE_t size(self) nogil: - return self.heap_ptr - - cdef int push(self, DOUBLE_t data) nogil: - """Push record on the priority heap. + return self.array_ptr + cdef int push(self, DOUBLE_t data, DOUBLE_t weight) nogil: + """Push record on the array. Returns 0 if successful; -1 on out of memory error. 
""" - cdef SIZE_t heap_ptr = self.heap_ptr + cdef SIZE_t array_ptr = self.array_ptr cdef SIZE_t i - cdef MinMaxHeapRecord* heap = NULL + cdef WeightedPQueueRecord* array # Resize if capacity not sufficient - if heap_ptr >= self.capacity: + if array_ptr >= self.capacity: self.capacity *= 2 - heap = realloc(self.heap_, + array = realloc(self.array_, self.capacity * - sizeof(MinMaxHeapRecord)) - if heap == NULL: + sizeof(WeightedPQueueRecord)) + + if array == NULL: # no free; __dealloc__ handles that return -1 - self.heap_ = heap + self.array_ = array - # Put element as last element of heap - heap = self.heap_ - heap[heap_ptr].data = data - - # bubble last element up according to mode - # max heap, sorted in descending order - i = heap_ptr - if self.mode == 1: - while(i != 0 and heap[i].data > heap[i-1].data): - heap[i], heap[i-1] = heap[i-1], heap[i] - i = i-1 - - # min heap, sorted in ascending order - else: - while(i != 0 and heap[i].data < heap[i-1].data): - heap[i], heap[i-1] = heap[i-1], heap[i] - i = i-1 + # Put element as last element of array + array = self.array_ + array[array_ptr].data = data + array[array_ptr].weight = weight + + # bubble last element up according until it is sorted + # in ascending order + i = array_ptr + while(i != 0 and array[i].data < array[i-1].data): + array[i], array[i-1] = array[i-1], array[i] + i -= 1 # Increase element count - self.heap_ptr = heap_ptr + 1 + self.array_ptr = array_ptr + 1 return 0 - cdef int remove(self, DOUBLE_t value) nogil: - """Remove a specific value from heap""" - cdef SIZE_t heap_ptr = self.heap_ptr - cdef MinMaxHeapRecord* heap = self.heap_ + cdef int remove(self, DOUBLE_t value, DOUBLE_t weight) nogil: + """Remove a specific value from array""" + cdef SIZE_t array_ptr = self.array_ptr + cdef WeightedPQueueRecord* array = self.array_ cdef SIZE_t idx_to_remove = -1 cdef SIZE_t i - if heap_ptr <= 0: + if array_ptr <= 0: return -1 # find element to remove - for i in range(heap_ptr): - if heap[i].data == value: + for i in range(array_ptr): + if array[i].data == value and array[i].weight == weight: idx_to_remove = i break + # should we throw an error if the element isn't found? # it shouldn't happen, but better to fail noisily...? 
+ if idx_to_remove == -1: + with gil: + raise ValueError() # move after the removed element over by one - for i in range(idx_to_remove, heap_ptr-1): - heap[i] = heap[i+1] + for i in range(idx_to_remove, array_ptr-1): + array[i] = array[i+1] - self.heap_ptr = heap_ptr - 1 + self.array_ptr = array_ptr - 1 return 0 - cdef int pop(self, DOUBLE_t* res) nogil: - """Remove top element from heap.""" - cdef SIZE_t heap_ptr = self.heap_ptr - cdef MinMaxHeapRecord* heap = self.heap_ + cdef int pop(self, DOUBLE_t* data, DOUBLE_t* weight) nogil: + """Remove top element from array.""" + cdef SIZE_t array_ptr = self.array_ptr + cdef WeightedPQueueRecord* array = self.array_ cdef SIZE_t i - if heap_ptr <= 0: + if array_ptr <= 0: return -1 # Take first element - res[0] = heap[0].data + data[0] = array[0].data + weight[0] = array[0].weight # move after the removed element over by one - for i in range(0, heap_ptr-1): - heap[i] = heap[i+1] + for i in range(0, array_ptr-1): + array[i] = array[i+1] - self.heap_ptr = heap_ptr - 1 + self.array_ptr = array_ptr - 1 return 0 - cdef int peek(self, DOUBLE_t* res) nogil: - """Write the top element from heap to a pointer.""" - cdef SIZE_t heap_ptr = self.heap_ptr - cdef MinMaxHeapRecord* heap = self.heap_ - if heap_ptr <= 0: + cdef int peek(self, DOUBLE_t* data, DOUBLE_t* weight) nogil: + """Write the top element from array to a pointer.""" + cdef SIZE_t array_ptr = self.array_ptr + cdef WeightedPQueueRecord* array = self.array_ + if array_ptr <= 0: return -1 # Take first value - res[0] = heap[0].data + data[0] = array[0].data + weight[0] = array[0].weight + return 0 + + cdef int get_index_data(self, SIZE_t idx, DOUBLE_t* value, + DOUBLE_t* weight) nogil: + """Write value and weight at the specified index to a pointer.""" + cdef SIZE_t array_ptr = self.array_ptr + cdef WeightedPQueueRecord* array = self.array_ + + if array_ptr <= 0: + return -1 + # Take value at idx + value[0] = array[idx].data + + # Take weight at idx + weight[0] = array[idx].weight return 0 # ============================================================================= -# MedianHeap data structure +# WeightedMedianHeap data structure # ============================================================================= -cdef class MedianHeap: +cdef class WeightedMedianHeap: def __cinit__(self, SIZE_t initial_capacity): self.initial_capacity = initial_capacity self.current_capacity = 0 - self.left_max_heap = MinMaxHeap(initial_capacity, "max") - self.right_min_heap = MinMaxHeap(initial_capacity, "min") + self.samples = WeightedPQueue(initial_capacity) + self.total_weight = 0 + self.k = 0 + self.sum_w_0_k = 0 cdef SIZE_t size(self) nogil: return self.current_capacity - cdef int push(self, DOUBLE_t data) nogil: - """Push a value to the MedianHeap to be considered + cdef int push(self, DOUBLE_t data, DOUBLE_t weight) nogil: + """Push a value and its associated weight + to the WeightedMedianHeap to be considered in the median calculation """ cdef double current_median cdef int return_value - if self.current_capacity == 0: - return_value = self.left_max_heap.push(data) - else: - self.get_median(¤t_median) - if current_median <= data: - # data is greater than or equal to current median, so it goes on min heap - return_value = self.right_min_heap.push(data) - else: - # data is less than current median, so it goes on max heap - return_value = self.left_max_heap.push(data) - self.rebalance() + return_value = self.samples.push(data, weight) self.current_capacity += 1 + self.total_weight += weight + 
self.update_median_parameters_post_push(data, weight) return return_value - cdef int remove(self, DOUBLE_t data) nogil: + cdef int update_median_parameters_post_push(self, DOUBLE_t data, + DOUBLE_t weight) nogil: + """Update the parameters used in the median calculation, + namely `k` and `sum_w_0_k` after an insertion""" + cdef double current_median + + # trivial case of one element. + if self.current_capacity == 1: + self.k = 1 + self.sum_w_0_k = self.total_weight + return 0 + + # get the current weighted median + self.get_median(¤t_median) + + # check if the value inserted is the same as the current median + if data == current_median: + # k stays the same, but add weight to sum_w_0_k + self.sum_w_0_k += weight + return 0 + + if data < current_median: + # inserting below the median, so increment k and + # then update self.sum_w_0_k accordingly by adding + # the weight that was added. + self.k += 1 + # update sum_w_0_k by adding the weight added + self.sum_w_0_k += weight + + # minimize k such that sum(W[0:k]) >= total_weight / 2 + # minimum value of k is 1 + while(self.k != 1 and (self.sum_w_0_k - self.get_weight_from_index(self.k-1) >= self.total_weight / 2)): + # ordering of these statements is very important + self.k -= 1 + self.sum_w_0_k -= self.get_weight_from_index(self.k) + return 0 + + if data > current_median: + # inserting above the median + # minimize k such that sum(W[0:k]) >= total_weight / 2 + while(self.k != self.current_capacity and self.sum_w_0_k < self.total_weight / 2): + self.k += 1 + self.sum_w_0_k += self.get_weight_from_index(self.k-1) + return 0 + + cdef DOUBLE_t get_weight_from_index(self, SIZE_t index) nogil: + """Given an index between [0,self.current_capacity], access + the appropriate heap and return the requested weight""" + cdef DOUBLE_t value + cdef DOUBLE_t weight + + self.samples.get_index_data(index, &value, &weight) + return weight + + cdef DOUBLE_t get_value_from_index(self, SIZE_t index) nogil: + """Given an index between [0,self.current_capacity], access + the appropriate heap and return the requested value""" + cdef DOUBLE_t value + cdef DOUBLE_t weight + + self.samples.get_index_data(index, &value, &weight) + return value + + cdef int remove(self, DOUBLE_t data, DOUBLE_t weight) nogil: """Remove a value from the MedianHeap, removing it from consideration in the median calculation """ - cdef double current_median + cdef double current_unweighted_median cdef int return_value - self.get_median(¤t_median) - if current_median == data: - # data is the same value as current median, it is in - # the bigger one - if self.right_min_heap.size() > self.left_max_heap.size(): - # it is in the right - return_value = self.right_min_heap.remove(data) - else: - # it is in the left - return_value = self.left_max_heap.remove(data) - elif current_median < data: - # data is greater than or equal to current median, so it is on min heap - return_value = self.right_min_heap.remove(data) - else: - # data is less than current median, so it is on max heap - return_value = self.left_max_heap.remove(data) - self.rebalance() + return_value = self.samples.remove(data, weight) self.current_capacity -= 1 + self.total_weight -= weight + self.update_median_parameters_post_remove(data, weight) return return_value - cdef int pop(self, DOUBLE_t* res) nogil: + cdef int pop(self, DOUBLE_t* data, DOUBLE_t* weight) nogil: """Pop a value from the MedianHeap, starting from the left and moving to the right. 
""" @@ -526,75 +569,68 @@ cdef class MedianHeap: if self.current_capacity == 0: return -1 - if self.left_max_heap.size() != 0: - # pop from the left - return_value = self.left_max_heap.pop(res) - elif self.right_min_heap.size() != 0: - # pop from right - return_value = self.right_min_heap.pop(res) - else: - return -1 - self.rebalance() + return_value = self.samples.pop(data, weight) self.current_capacity -= 1 + self.total_weight -= weight[0] + self.update_median_parameters_post_remove(data[0], + weight[0]) return return_value - cdef int get_median(self, double* data) nogil: - """Return the current median""" - if self.current_capacity == 0: - return -1 + cdef int update_median_parameters_post_remove(self, DOUBLE_t data, + DOUBLE_t weight) nogil: + """Update the parameters used in the median calculation, + namely `k` and `sum_w_0_k` after a removal""" + cdef DOUBLE_t current_median + # trivial case of one element. + if self.current_capacity == 1: + self.k = 1 + self.sum_w_0_k = self.total_weight + return 0 + + # get the current weighted median + self.get_median(¤t_median) - cdef SIZE_t left_max_heap_size = self.left_max_heap.size() - cdef SIZE_t right_min_heap_size = self.right_min_heap.size() - cdef DOUBLE_t left_max_heap_median - cdef DOUBLE_t right_min_heap_median - - if self.current_capacity < 2: - # there is only one thing, so set the median to be that - if left_max_heap_size >= 1: - self.left_max_heap.peek(&left_max_heap_median) - data[0] = left_max_heap_median - else: - self.right_min_heap.peek(&right_min_heap_median) - data[0] = right_min_heap_median + # check if the value removed is the same as the current median + if data == current_median: + # k stays the same, but remove weight from sum_w_0_k + self.sum_w_0_k -= weight return 0 - self.left_max_heap.peek(&left_max_heap_median) - self.right_min_heap.peek(&right_min_heap_median) - - if left_max_heap_size == right_min_heap_size: - # take the average of the two - data[0] = (left_max_heap_median + - right_min_heap_median) / 2.0 - elif left_max_heap_size > right_min_heap_size: - # left max heap larger, so median is at its' top - data[0] = left_max_heap_median - else: - # right min heap is larger, so median is at its' top - data[0] = right_min_heap_median - return 0 - cdef int rebalance(self) nogil: - """Rebalance the left max heap and the left min heap to have a - one element or less difference in size""" - cdef SIZE_t left_max_heap_size = self.left_max_heap.size() - cdef SIZE_t right_min_heap_size = self.right_min_heap.size() - cdef SIZE_t size_difference = left_max_heap_size - right_min_heap_size - cdef DOUBLE_t popped - cdef SIZE_t i + if data < current_median: + # removing below the median, so decrement k and + # then update self.sum_w_0_k accordingly by subtracting + # the removed weight + self.k -= 1 + # update sum_w_0_k by removing the weight at index k + self.sum_w_0_k -= weight + + # minimize k such that sum(W[0:k]) >= total_weight / 2 + # by incrementing k and updating sum_w_0_k accordingly + # until the condition is met. 
+ while(self.k != self.current_capacity and self.sum_w_0_k < self.total_weight / 2): + # ordering of these statements is very important + self.k += 1 + self.sum_w_0_k += self.get_weight_from_index(self.k-1) + return 0 - if size_difference >= -1 and size_difference <= 1: - # no balancing needed + if data > current_median: + # removing above the median + # minimize k such that sum(W[0:k]) >= total_weight / 2 + while(self.k != 1 and self.sum_w_0_k - self.get_weight_from_index(self.k-1) >= self.total_weight / 2): + # mind the ordering + self.k -= 1 + self.sum_w_0_k -= self.get_weight_from_index(self.k) return 0 - if size_difference > 1: - # left max heap bigger - for i in range(0, size_difference - 1): - # pop from left max heap and push into right min heap - self.left_max_heap.pop(&popped) - self.right_min_heap.push(popped) - else: - # right min heap bigger - for i in range(0, (size_difference * -1) - 1): - # pop from right min heap and push into left max heap - self.right_min_heap.pop(&popped) - self.left_max_heap.push(popped) + cdef int get_median(self, double* median) nogil: + """Write the median to a pointer, taking into account + sample weights.""" + if self.sum_w_0_k < (self.total_weight / 2.0): + return -1 + if self.sum_w_0_k == (self.total_weight / 2.0): + # split median + median[0] = (self.get_value_from_index(self.k) + self.get_value_from_index(self.k-1))/2 + if self.sum_w_0_k > (self.total_weight / 2.0): + # whole median + median[0] = self.get_value_from_index(self.k-1) return 0 From 9920cfcd91c44a771d1a7d50f001a73382428338 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Sun, 17 Jul 2016 12:00:08 -0500 Subject: [PATCH 42/75] fix errors wrt median insertion case --- sklearn/tree/_criterion.pyx | 14 +++- sklearn/tree/_utils.pxd | 7 +- sklearn/tree/_utils.pyx | 139 +++++++++++++++++++----------------- 3 files changed, 91 insertions(+), 69 deletions(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 15372f686852a..2367bcb5e1c58 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -1038,7 +1038,6 @@ cdef class MAE(RegressionCriterion): cdef SIZE_t k cdef DOUBLE_t y_ik cdef DOUBLE_t w = 1.0 - # Fill accumulators with MedianHeaps with gil: for k in range(self.n_outputs): @@ -1221,6 +1220,10 @@ cdef class MAE(RegressionCriterion): cdef SIZE_t start = self.start cdef SIZE_t pos = self.pos cdef SIZE_t end = self.end + # with gil: + # print "start {}".format(start) + # print "pos {}".format(pos) + # print "end {}".format(end) cdef SIZE_t i, p, k cdef DOUBLE_t y_ik @@ -1234,6 +1237,8 @@ cdef class MAE(RegressionCriterion): for k in range(self.n_outputs): ( left_child_heaps[k]).get_median(&median) + # with gil: + # print "median {}".format(median) for p in range(start, pos): i = samples[p] @@ -1241,15 +1246,22 @@ cdef class MAE(RegressionCriterion): impurity_left[0] += fabs(( y_ik) - median) impurity_left[0] /= ((pos - start) * self.n_outputs) + # with gil: + # print "impurity_left[0] {}".format(impurity_left[0]) for k in range(self.n_outputs): ( right_child_heaps[k]).get_median(&median) + # with gil: + # print "median {}".format(median) for p in range(pos, end): i = samples[p] y_ik = y[i * self.y_stride + k] + impurity_right[0] += fabs(( y_ik) - median) impurity_right[0] /= ((end - pos) * self.n_outputs) + # with gil: + # print "impurity_right[0] {}".format(impurity_right[0]) cdef class FriedmanMSE(MSE): diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd index 9d5b8a725b371..2f31f915a3191 100644 --- a/sklearn/tree/_utils.pxd +++ 
b/sklearn/tree/_utils.pxd @@ -128,7 +128,9 @@ cdef class WeightedPQueue: cdef int remove(self, DOUBLE_t value, DOUBLE_t weight) nogil cdef int pop(self, DOUBLE_t* data, DOUBLE_t* weight) nogil cdef int peek(self, DOUBLE_t* res, DOUBLE_t* weight) nogil - cdef int get_index_data(self, SIZE_t idx, DOUBLE_t* value, DOUBLE_t* weight) nogil + cdef DOUBLE_t get_weight_from_index(self, SIZE_t index) nogil + cdef DOUBLE_t get_value_from_index(self, SIZE_t index) nogil + # ============================================================================= # MedianHeap data structure @@ -136,7 +138,6 @@ cdef class WeightedPQueue: cdef class WeightedMedianHeap: cdef SIZE_t initial_capacity - cdef SIZE_t current_capacity cdef WeightedPQueue samples cdef DOUBLE_t total_weight cdef SIZE_t k @@ -146,8 +147,6 @@ cdef class WeightedMedianHeap: cdef SIZE_t size(self) nogil cdef int push(self, DOUBLE_t data, DOUBLE_t weight) nogil cdef int update_median_parameters_post_push(self, DOUBLE_t data, DOUBLE_t weight) nogil - cdef DOUBLE_t get_weight_from_index(self, SIZE_t index) nogil - cdef DOUBLE_t get_value_from_index(self, SIZE_t index) nogil cdef int remove(self, DOUBLE_t data, DOUBLE_t weight) nogil cdef int pop(self, DOUBLE_t* data, DOUBLE_t* weight) nogil cdef int update_median_parameters_post_remove(self, DOUBLE_t data, DOUBLE_t weight) nogil diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index 17e9361fc8f6c..7cd536080bc77 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -326,7 +326,6 @@ cdef class WeightedPQueue: def __cinit__(self, SIZE_t capacity): self.capacity = capacity self.array_ptr = 0 - self.array_ = calloc(capacity, sizeof(WeightedPQueueRecord)) if self.array_ == NULL: @@ -346,15 +345,15 @@ cdef class WeightedPQueue: Returns 0 if successful; -1 on out of memory error. 
""" cdef SIZE_t array_ptr = self.array_ptr + cdef WeightedPQueueRecord* array = NULL cdef SIZE_t i - cdef WeightedPQueueRecord* array # Resize if capacity not sufficient if array_ptr >= self.capacity: self.capacity *= 2 array = realloc(self.array_, - self.capacity * - sizeof(WeightedPQueueRecord)) + self.capacity * + sizeof(WeightedPQueueRecord)) if array == NULL: # no free; __dealloc__ handles that @@ -437,20 +436,35 @@ cdef class WeightedPQueue: weight[0] = array[0].weight return 0 - cdef int get_index_data(self, SIZE_t idx, DOUBLE_t* value, - DOUBLE_t* weight) nogil: - """Write value and weight at the specified index to a pointer.""" + cdef DOUBLE_t get_weight_from_index(self, SIZE_t index) nogil: + """Given an index between [0,self.current_capacity], access + the appropriate heap and return the requested weight""" cdef SIZE_t array_ptr = self.array_ptr cdef WeightedPQueueRecord* array = self.array_ - if array_ptr <= 0: + if array_ptr <= 0 or index >= array_ptr: + with gil: + print index + print array_ptr + print "FALIED ON WEIGHT" return -1 - # Take value at idx - value[0] = array[idx].data - # Take weight at idx - weight[0] = array[idx].weight - return 0 + return array[index].weight + + cdef DOUBLE_t get_value_from_index(self, SIZE_t index) nogil: + """Given an index between [0,self.current_capacity], access + the appropriate heap and return the requested value""" + cdef SIZE_t array_ptr = self.array_ptr + cdef WeightedPQueueRecord* array = self.array_ + + if array_ptr <= 0 or index >= array_ptr: + with gil: + print index + print array_ptr + print "FALIED ON VALUE" + return -1 + # Take value at idx + return array[index].data # ============================================================================= # WeightedMedianHeap data structure @@ -460,14 +474,13 @@ cdef class WeightedMedianHeap: def __cinit__(self, SIZE_t initial_capacity): self.initial_capacity = initial_capacity - self.current_capacity = 0 self.samples = WeightedPQueue(initial_capacity) self.total_weight = 0 self.k = 0 self.sum_w_0_k = 0 cdef SIZE_t size(self) nogil: - return self.current_capacity + return self.samples.size() cdef int push(self, DOUBLE_t data, DOUBLE_t weight) nogil: """Push a value and its associated weight @@ -478,8 +491,6 @@ cdef class WeightedMedianHeap: cdef int return_value return_value = self.samples.push(data, weight) - self.current_capacity += 1 - self.total_weight += weight self.update_median_parameters_post_push(data, weight) return return_value @@ -490,19 +501,21 @@ cdef class WeightedMedianHeap: cdef double current_median # trivial case of one element. 
- if self.current_capacity == 1: + if self.size() == 1: self.k = 1 + self.total_weight = weight self.sum_w_0_k = self.total_weight return 0 - # get the current weighted median + # get the original weighted median self.get_median(¤t_median) - + self.total_weight += weight # check if the value inserted is the same as the current median - if data == current_median: - # k stays the same, but add weight to sum_w_0_k - self.sum_w_0_k += weight - return 0 + # if data == current_median: + # # k stays the same, but add weight to sum_w_0_k + # self.k += 1 + # self.sum_w_0_k += weight + # return 0 if data < current_median: # inserting below the median, so increment k and @@ -514,38 +527,21 @@ cdef class WeightedMedianHeap: # minimize k such that sum(W[0:k]) >= total_weight / 2 # minimum value of k is 1 - while(self.k != 1 and (self.sum_w_0_k - self.get_weight_from_index(self.k-1) >= self.total_weight / 2)): + while(self.k > 1 and (self.sum_w_0_k - self.samples.get_weight_from_index(self.k-1) >= self.total_weight / 2.0)): # ordering of these statements is very important self.k -= 1 - self.sum_w_0_k -= self.get_weight_from_index(self.k) + self.sum_w_0_k -= self.samples.get_weight_from_index(self.k) + return 0 - if data > current_median: + if data >= current_median: # inserting above the median # minimize k such that sum(W[0:k]) >= total_weight / 2 - while(self.k != self.current_capacity and self.sum_w_0_k < self.total_weight / 2): + while(self.k < self.samples.size() and self.sum_w_0_k < self.total_weight / 2.0): self.k += 1 - self.sum_w_0_k += self.get_weight_from_index(self.k-1) + self.sum_w_0_k += self.samples.get_weight_from_index(self.k-1) return 0 - cdef DOUBLE_t get_weight_from_index(self, SIZE_t index) nogil: - """Given an index between [0,self.current_capacity], access - the appropriate heap and return the requested weight""" - cdef DOUBLE_t value - cdef DOUBLE_t weight - - self.samples.get_index_data(index, &value, &weight) - return weight - - cdef DOUBLE_t get_value_from_index(self, SIZE_t index) nogil: - """Given an index between [0,self.current_capacity], access - the appropriate heap and return the requested value""" - cdef DOUBLE_t value - cdef DOUBLE_t weight - - self.samples.get_index_data(index, &value, &weight) - return value - cdef int remove(self, DOUBLE_t data, DOUBLE_t weight) nogil: """Remove a value from the MedianHeap, removing it from consideration in the median calculation @@ -554,8 +550,6 @@ cdef class WeightedMedianHeap: cdef int return_value return_value = self.samples.remove(data, weight) - self.current_capacity -= 1 - self.total_weight -= weight self.update_median_parameters_post_remove(data, weight) return return_value @@ -566,12 +560,10 @@ cdef class WeightedMedianHeap: cdef int return_value # no elements to pop - if self.current_capacity == 0: + if self.samples.size() == 0: return -1 return_value = self.samples.pop(data, weight) - self.current_capacity -= 1 - self.total_weight -= weight[0] self.update_median_parameters_post_remove(data[0], weight[0]) return return_value @@ -581,25 +573,38 @@ cdef class WeightedMedianHeap: """Update the parameters used in the median calculation, namely `k` and `sum_w_0_k` after a removal""" cdef DOUBLE_t current_median + # reset parameters because empty + if self.samples.size() == 0: + self.k = 0 + self.total_weight = 0 + self.sum_w_0_k = 0 + return 0 + # trivial case of one element. 
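# ---------------------------------------------------------------------------
# Editorial example, not part of the patch (unit weights, hypothetical
# numbers): the removal update below mirrors the insertion case. From
# [1, 2, 5, 8] with k == 2 and sum_w_0_k == 2, removing the value 1 (below
# the median) first decrements k to 1 and sum_w_0_k to 1; with total_weight
# now 3, the re-minimization loop advances k back to 2, since
# sum_w_0_k == 1 < total_weight / 2 == 1.5, giving the whole median 5.
# ---------------------------------------------------------------------------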
- if self.current_capacity == 1: + if self.samples.size() == 1: self.k = 1 + self.total_weight -= weight self.sum_w_0_k = self.total_weight return 0 # get the current weighted median self.get_median(¤t_median) + self.total_weight -= weight # check if the value removed is the same as the current median - if data == current_median: - # k stays the same, but remove weight from sum_w_0_k - self.sum_w_0_k -= weight - return 0 + # if data == current_median: + # # with gil: + # # print "removing at median" + # # k stays the same, but remove weight from sum_w_0_k + # self.sum_w_0_k -= weight + # return 0 if data < current_median: # removing below the median, so decrement k and # then update self.sum_w_0_k accordingly by subtracting # the removed weight + # with gil: + # print "removing below median" self.k -= 1 # update sum_w_0_k by removing the weight at index k self.sum_w_0_k -= weight @@ -607,30 +612,36 @@ cdef class WeightedMedianHeap: # minimize k such that sum(W[0:k]) >= total_weight / 2 # by incrementing k and updating sum_w_0_k accordingly # until the condition is met. - while(self.k != self.current_capacity and self.sum_w_0_k < self.total_weight / 2): + while(self.k < self.samples.size() and self.sum_w_0_k < self.total_weight / 2.0): # ordering of these statements is very important self.k += 1 - self.sum_w_0_k += self.get_weight_from_index(self.k-1) + self.sum_w_0_k += self.samples.get_weight_from_index(self.k-1) return 0 - if data > current_median: + if data >= current_median: # removing above the median + # with gil: + # print "removing above median" # minimize k such that sum(W[0:k]) >= total_weight / 2 - while(self.k != 1 and self.sum_w_0_k - self.get_weight_from_index(self.k-1) >= self.total_weight / 2): + while(self.k > 1 and self.sum_w_0_k - self.samples.get_weight_from_index(self.k-1) >= self.total_weight / 2.0): # mind the ordering self.k -= 1 - self.sum_w_0_k -= self.get_weight_from_index(self.k) + self.sum_w_0_k -= self.samples.get_weight_from_index(self.k) return 0 cdef int get_median(self, double* median) nogil: """Write the median to a pointer, taking into account sample weights.""" + # with gil: + # print "entered get_median" if self.sum_w_0_k < (self.total_weight / 2.0): + # with gil: + # raise ValueError() return -1 if self.sum_w_0_k == (self.total_weight / 2.0): # split median - median[0] = (self.get_value_from_index(self.k) + self.get_value_from_index(self.k-1))/2 + median[0] = ( (self.samples.get_value_from_index(self.k) + self.samples.get_value_from_index(self.k-1)) / 2.0) if self.sum_w_0_k > (self.total_weight / 2.0): # whole median - median[0] = self.get_value_from_index(self.k-1) + median[0] = self.samples.get_value_from_index(self.k-1) return 0 From 53207d48245c3424042c15e52fed7d9338f34221 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Sun, 17 Jul 2016 12:11:47 -0500 Subject: [PATCH 43/75] spurious comment to force recythonization --- sklearn/tree/_splitter.pyx | 2 +- sklearn/tree/_tree.pyx | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 0617508aab236..b449f5aecbd8b 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -35,7 +35,7 @@ from ._utils cimport RAND_R_MAX from ._utils cimport safe_realloc cdef double INFINITY = np.inf - +# SPURIOUS COMMENT TO FORCE RECYTHONIZE. REMOVE ME. 
# Mitigate precision differences between 32 bit and 64 bit cdef DTYPE_t FEATURE_THRESHOLD = 1e-7 diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index f44320a7b47ae..38eb4815ff5e1 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -36,7 +36,7 @@ from ._utils cimport PriorityHeap from ._utils cimport PriorityHeapRecord from ._utils cimport safe_realloc from ._utils cimport sizet_ptr_to_ndarray - +# SPURIOUS COMMENT TO FORCE RECYTHONIZE. REMOVE ME. cdef extern from "numpy/arrayobject.h": object PyArray_NewFromDescr(object subtype, np.dtype descr, int nd, np.npy_intp* dims, From 69072274d66871124f32840624715b178cd3d8df Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Sun, 17 Jul 2016 12:39:29 -0500 Subject: [PATCH 44/75] general code cleanup --- sklearn/tree/_splitter.pyx | 2 +- sklearn/tree/_tree.pyx | 4 +- sklearn/tree/_utils.pxd | 4 +- sklearn/tree/_utils.pyx | 83 +++++++++++++++----------------------- 4 files changed, 37 insertions(+), 56 deletions(-) diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index b449f5aecbd8b..0617508aab236 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -35,7 +35,7 @@ from ._utils cimport RAND_R_MAX from ._utils cimport safe_realloc cdef double INFINITY = np.inf -# SPURIOUS COMMENT TO FORCE RECYTHONIZE. REMOVE ME. + # Mitigate precision differences between 32 bit and 64 bit cdef DTYPE_t FEATURE_THRESHOLD = 1e-7 diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 38eb4815ff5e1..0dd1a6c92083b 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -1,4 +1,4 @@ -# cython: cdivision=True +# cython: cdivision=Truex # cython: boundscheck=False # cython: wraparound=False @@ -36,7 +36,7 @@ from ._utils cimport PriorityHeap from ._utils cimport PriorityHeapRecord from ._utils cimport safe_realloc from ._utils cimport sizet_ptr_to_ndarray -# SPURIOUS COMMENT TO FORCE RECYTHONIZE. REMOVE ME. + cdef extern from "numpy/arrayobject.h": object PyArray_NewFromDescr(object subtype, np.dtype descr, int nd, np.npy_intp* dims, diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd index 2f31f915a3191..fe803ba2fe779 100644 --- a/sklearn/tree/_utils.pxd +++ b/sklearn/tree/_utils.pxd @@ -141,8 +141,8 @@ cdef class WeightedMedianHeap: cdef WeightedPQueue samples cdef DOUBLE_t total_weight cdef SIZE_t k - cdef DOUBLE_t sum_w_0_k # represents sum(weights[0:k]) - # = w[0] + w[1] + ... + w[k-1] + cdef DOUBLE_t sum_w_0_k # represents sum(weights[0:k]) + # = w[0] + w[1] + ... + w[k-1] cdef SIZE_t size(self) nogil cdef int push(self, DOUBLE_t data, DOUBLE_t weight) nogil diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index 7cd536080bc77..c38196b38cab9 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -377,7 +377,8 @@ cdef class WeightedPQueue: return 0 cdef int remove(self, DOUBLE_t value, DOUBLE_t weight) nogil: - """Remove a specific value from array""" + """Remove a specific value/weight record from the array. + Returns 0 if successful, -1 if record not found.""" cdef SIZE_t array_ptr = self.array_ptr cdef WeightedPQueueRecord* array = self.array_ cdef SIZE_t idx_to_remove = -1 @@ -392,13 +393,11 @@ cdef class WeightedPQueue: idx_to_remove = i break - # should we throw an error if the element isn't found? - # it shouldn't happen, but better to fail noisily...? if idx_to_remove == -1: - with gil: - raise ValueError() + return -1 - # move after the removed element over by one + # shift the elements after the removed element + # to the left. 
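# ---------------------------------------------------------------------------
# Editorial note, not part of the patch: the element-by-element shift below
# makes each removal O(n). A functionally equivalent variant could rely on
# libc's memmove, which handles the overlapping ranges safely (a sketch
# under that assumption, using the cimport Cython provides):
#
#     from libc.string cimport memmove
#     ...
#     memmove(&array[idx_to_remove], &array[idx_to_remove + 1],
#             (array_ptr - idx_to_remove - 1) * sizeof(WeightedPQueueRecord))
# ---------------------------------------------------------------------------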
for i in range(idx_to_remove, array_ptr-1): array[i] = array[i+1] @@ -406,7 +405,8 @@ cdef class WeightedPQueue: return 0 cdef int pop(self, DOUBLE_t* data, DOUBLE_t* weight) nogil: - """Remove top element from array.""" + """Remove the top (minimum) element from array. + Returns 0 if successful, -1 if nothing to remove.""" cdef SIZE_t array_ptr = self.array_ptr cdef WeightedPQueueRecord* array = self.array_ cdef SIZE_t i @@ -414,11 +414,11 @@ cdef class WeightedPQueue: if array_ptr <= 0: return -1 - # Take first element data[0] = array[0].data weight[0] = array[0].weight - # move after the removed element over by one + # shift the elements after the removed element + # to the left. for i in range(0, array_ptr-1): array[i] = array[i+1] @@ -426,7 +426,8 @@ cdef class WeightedPQueue: return 0 cdef int peek(self, DOUBLE_t* data, DOUBLE_t* weight) nogil: - """Write the top element from array to a pointer.""" + """Write the top element from array to a pointer. + Returns 0 if successful, -1 if nothing to write.""" cdef SIZE_t array_ptr = self.array_ptr cdef WeightedPQueueRecord* array = self.array_ if array_ptr <= 0: @@ -444,11 +445,9 @@ cdef class WeightedPQueue: if array_ptr <= 0 or index >= array_ptr: with gil: - print index - print array_ptr - print "FALIED ON WEIGHT" - return -1 - # Take weight at idx + raise ValueError("Tried to access element " + "at index out of bounds.") + # get weight at index return array[index].weight cdef DOUBLE_t get_value_from_index(self, SIZE_t index) nogil: @@ -459,11 +458,9 @@ cdef class WeightedPQueue: if array_ptr <= 0 or index >= array_ptr: with gil: - print index - print array_ptr - print "FALIED ON VALUE" - return -1 - # Take value at idx + raise ValueError("Tried to access element " + "at index out of bounds.") + # get value at index return array[index].data # ============================================================================= @@ -480,6 +477,7 @@ cdef class WeightedMedianHeap: self.sum_w_0_k = 0 cdef SIZE_t size(self) nogil: + """Return the number of samples in the WeightedMedianHeap""" return self.samples.size() cdef int push(self, DOUBLE_t data, DOUBLE_t weight) nogil: @@ -510,12 +508,6 @@ cdef class WeightedMedianHeap: # get the original weighted median self.get_median(¤t_median) self.total_weight += weight - # check if the value inserted is the same as the current median - # if data == current_median: - # # k stays the same, but add weight to sum_w_0_k - # self.k += 1 - # self.sum_w_0_k += weight - # return 0 if data < current_median: # inserting below the median, so increment k and @@ -527,17 +519,19 @@ cdef class WeightedMedianHeap: # minimize k such that sum(W[0:k]) >= total_weight / 2 # minimum value of k is 1 - while(self.k > 1 and (self.sum_w_0_k - self.samples.get_weight_from_index(self.k-1) >= self.total_weight / 2.0)): - # ordering of these statements is very important + while(self.k > 1 and (self.sum_w_0_k - + self.samples.get_weight_from_index(self.k-1) + >= self.total_weight / 2.0)): self.k -= 1 self.sum_w_0_k -= self.samples.get_weight_from_index(self.k) return 0 if data >= current_median: - # inserting above the median + # inserting above or at the median # minimize k such that sum(W[0:k]) >= total_weight / 2 - while(self.k < self.samples.size() and self.sum_w_0_k < self.total_weight / 2.0): + while(self.k < self.samples.size() and + (self.sum_w_0_k < self.total_weight / 2.0)): self.k += 1 self.sum_w_0_k += self.samples.get_weight_from_index(self.k-1) return 0 @@ -591,20 +585,11 @@ cdef class WeightedMedianHeap: 
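# ---------------------------------------------------------------------------
# Editorial note, not part of the patch: the ordering of the next two
# statements matters. get_median() is evaluated while total_weight still
# includes the record being removed, and the data < current_median test
# below classifies the removal against that pre-update median; decrementing
# total_weight first would shift both get_median() and every comparison
# against total_weight / 2 in the re-minimization loops.
# ---------------------------------------------------------------------------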
self.get_median(¤t_median) self.total_weight -= weight - # check if the value removed is the same as the current median - # if data == current_median: - # # with gil: - # # print "removing at median" - # # k stays the same, but remove weight from sum_w_0_k - # self.sum_w_0_k -= weight - # return 0 - if data < current_median: # removing below the median, so decrement k and # then update self.sum_w_0_k accordingly by subtracting # the removed weight - # with gil: - # print "removing below median" + self.k -= 1 # update sum_w_0_k by removing the weight at index k self.sum_w_0_k -= weight @@ -612,19 +597,18 @@ cdef class WeightedMedianHeap: # minimize k such that sum(W[0:k]) >= total_weight / 2 # by incrementing k and updating sum_w_0_k accordingly # until the condition is met. - while(self.k < self.samples.size() and self.sum_w_0_k < self.total_weight / 2.0): - # ordering of these statements is very important + while(self.k < self.samples.size() and + (self.sum_w_0_k < self.total_weight / 2.0)): self.k += 1 self.sum_w_0_k += self.samples.get_weight_from_index(self.k-1) return 0 if data >= current_median: # removing above the median - # with gil: - # print "removing above median" # minimize k such that sum(W[0:k]) >= total_weight / 2 - while(self.k > 1 and self.sum_w_0_k - self.samples.get_weight_from_index(self.k-1) >= self.total_weight / 2.0): - # mind the ordering + while(self.k > 1 and ((self.sum_w_0_k - + self.samples.get_weight_from_index(self.k-1)) + >= self.total_weight / 2.0)): self.k -= 1 self.sum_w_0_k -= self.samples.get_weight_from_index(self.k) return 0 @@ -632,15 +616,12 @@ cdef class WeightedMedianHeap: cdef int get_median(self, double* median) nogil: """Write the median to a pointer, taking into account sample weights.""" - # with gil: - # print "entered get_median" if self.sum_w_0_k < (self.total_weight / 2.0): - # with gil: - # raise ValueError() return -1 if self.sum_w_0_k == (self.total_weight / 2.0): # split median - median[0] = ( (self.samples.get_value_from_index(self.k) + self.samples.get_value_from_index(self.k-1)) / 2.0) + median[0] = (self.samples.get_value_from_index(self.k) + + self.samples.get_value_from_index(self.k-1)) / 2.0 if self.sum_w_0_k > (self.total_weight / 2.0): # whole median median[0] = self.samples.get_value_from_index(self.k-1) From 8d550979933e2eabcf192a44612d46e06845446e Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Sun, 17 Jul 2016 14:02:38 -0500 Subject: [PATCH 45/75] fix typo in _tree.pyx --- sklearn/tree/_tree.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 0dd1a6c92083b..f44320a7b47ae 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -1,4 +1,4 @@ -# cython: cdivision=Truex +# cython: cdivision=True # cython: boundscheck=False # cython: wraparound=False From b465abc06ad605565f6e93a0a8bbd4a61bcb0649 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Sun, 17 Jul 2016 14:49:30 -0500 Subject: [PATCH 46/75] removed some extraneous comments --- sklearn/tree/_criterion.pyx | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 2367bcb5e1c58..0ae7163770bf8 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -1220,10 +1220,6 @@ cdef class MAE(RegressionCriterion): cdef SIZE_t start = self.start cdef SIZE_t pos = self.pos cdef SIZE_t end = self.end - # with gil: - # print "start {}".format(start) - # print "pos {}".format(pos) - # print "end {}".format(end) cdef 
SIZE_t i, p, k cdef DOUBLE_t y_ik @@ -1237,8 +1233,6 @@ cdef class MAE(RegressionCriterion): for k in range(self.n_outputs): ( left_child_heaps[k]).get_median(&median) - # with gil: - # print "median {}".format(median) for p in range(start, pos): i = samples[p] @@ -1246,13 +1240,9 @@ cdef class MAE(RegressionCriterion): impurity_left[0] += fabs(( y_ik) - median) impurity_left[0] /= ((pos - start) * self.n_outputs) - # with gil: - # print "impurity_left[0] {}".format(impurity_left[0]) for k in range(self.n_outputs): ( right_child_heaps[k]).get_median(&median) - # with gil: - # print "median {}".format(median) for p in range(pos, end): i = samples[p] @@ -1260,8 +1250,6 @@ cdef class MAE(RegressionCriterion): impurity_right[0] += fabs(( y_ik) - median) impurity_right[0] /= ((end - pos) * self.n_outputs) - # with gil: - # print "impurity_right[0] {}".format(impurity_right[0]) cdef class FriedmanMSE(MSE): From df9e64a5456215c2240e4b6ada0bd318b85a2293 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Sun, 17 Jul 2016 16:38:41 -0500 Subject: [PATCH 47/75] [ci skip] remove earlier microchanges --- sklearn/tree/_criterion.pyx | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 0ae7163770bf8..ed037dd3aa57d 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -814,7 +814,6 @@ cdef class RegressionCriterion(Criterion): cdef SIZE_t k cdef DOUBLE_t w = 1.0 cdef DOUBLE_t y_ik - cdef DOUBLE_t w_y_ik # Update statistics up to new_pos # @@ -833,8 +832,7 @@ cdef class RegressionCriterion(Criterion): for k in range(self.n_outputs): y_ik = y[i * self.y_stride + k] - w_y_ik = w * y_ik - sum_left[k] += w_y_ik + sum_left[k] += w * y_ik self.weighted_n_left += w else: @@ -848,8 +846,7 @@ cdef class RegressionCriterion(Criterion): for k in range(self.n_outputs): y_ik = y[i * self.y_stride + k] - w_y_ik = w * y_ik - sum_left[k] -= w_y_ik + sum_left[k] -= w * y_ik self.weighted_n_left -= w From 32c1fefcef3c0c1a154c08731040236304e85a72 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Sun, 17 Jul 2016 16:41:11 -0500 Subject: [PATCH 48/75] [ci skip] remove change to priorityheap --- sklearn/tree/_utils.pyx | 65 +++++++++++++++++++++-------------------- 1 file changed, 33 insertions(+), 32 deletions(-) diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index c38196b38cab9..2c1b3fb2b79d1 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -173,36 +173,6 @@ cdef class Stack: # PriorityHeap data structure # ============================================================================= -cdef class PriorityHeap: - """A priority queue implemented as a binary heap. - - The heap invariant is that the impurity improvement of the parent record - is larger then the impurity improvement of the children. - - Attributes - ---------- - capacity : SIZE_t - The capacity of the heap - - heap_ptr : SIZE_t - The water mark of the heap; the heap grows from left to right in the - array ``heap_``. The following invariant holds ``heap_ptr < capacity``. - - heap_ : PriorityHeapRecord* - The array of heap records. 
The maximum element is on the left; - the heap grows from left to right - """ - - def __cinit__(self, SIZE_t capacity): - self.capacity = capacity - self.heap_ptr = 0 - self.heap_ = malloc(capacity * sizeof(PriorityHeapRecord)) - if self.heap_ == NULL: - raise MemoryError() - - def __dealloc__(self): - free(self.heap_) - cdef void heapify_up(self, PriorityHeapRecord* heap, SIZE_t pos) nogil: """Restore heap invariant parent.improvement > child.improvement from ``pos`` upwards. """ @@ -235,6 +205,37 @@ cdef class PriorityHeap: heap[pos], heap[largest] = heap[largest], heap[pos] self.heapify_down(heap, largest, heap_length) + +cdef class PriorityHeap: + """A priority queue implemented as a binary heap. + + The heap invariant is that the impurity improvement of the parent record + is larger then the impurity improvement of the children. + + Attributes + ---------- + capacity : SIZE_t + The capacity of the heap + + heap_ptr : SIZE_t + The water mark of the heap; the heap grows from left to right in the + array ``heap_``. The following invariant holds ``heap_ptr < capacity``. + + heap_ : PriorityHeapRecord* + The array of heap records. The maximum element is on the left; + the heap grows from left to right + """ + + def __cinit__(self, SIZE_t capacity): + self.capacity = capacity + self.heap_ptr = 0 + self.heap_ = malloc(capacity * sizeof(PriorityHeapRecord)) + if self.heap_ == NULL: + raise MemoryError() + + def __dealloc__(self): + free(self.heap_) + cdef bint is_empty(self) nogil: return self.heap_ptr <= 0 @@ -274,7 +275,7 @@ cdef class PriorityHeap: heap[heap_ptr].improvement = improvement # Heapify up - self.heapify_up(heap, heap_ptr) + heapify_up(heap, heap_ptr) # Increase element count self.heap_ptr = heap_ptr + 1 @@ -296,7 +297,7 @@ cdef class PriorityHeap: # Restore heap invariant if heap_ptr > 1: - self.heapify_down(heap, 0, heap_ptr - 1) + heapify_down(heap, 0, heap_ptr - 1) self.heap_ptr = heap_ptr - 1 From 5e2cd1a232f0b30864993ea21532af823c98a76e Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Sun, 17 Jul 2016 16:42:48 -0500 Subject: [PATCH 49/75] [ci skip] fix indentation --- sklearn/tree/_utils.pyx | 62 ++++++++++++++++++++--------------------- 1 file changed, 31 insertions(+), 31 deletions(-) diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index 2c1b3fb2b79d1..21756f89f50af 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -173,37 +173,37 @@ cdef class Stack: # PriorityHeap data structure # ============================================================================= - cdef void heapify_up(self, PriorityHeapRecord* heap, SIZE_t pos) nogil: - """Restore heap invariant parent.improvement > child.improvement from - ``pos`` upwards. """ - if pos == 0: - return - - cdef SIZE_t parent_pos = (pos - 1) / 2 - - if heap[parent_pos].improvement < heap[pos].improvement: - heap[parent_pos], heap[pos] = heap[pos], heap[parent_pos] - self.heapify_up(heap, parent_pos) - - cdef void heapify_down(self, PriorityHeapRecord* heap, SIZE_t pos, - SIZE_t heap_length) nogil: - """Restore heap invariant parent.improvement > children.improvement from - ``pos`` downwards. 
""" - cdef SIZE_t left_pos = 2 * (pos + 1) - 1 - cdef SIZE_t right_pos = 2 * (pos + 1) - cdef SIZE_t largest = pos - - if (left_pos < heap_length and - heap[left_pos].improvement > heap[largest].improvement): - largest = left_pos - - if (right_pos < heap_length and - heap[right_pos].improvement > heap[largest].improvement): - largest = right_pos - - if largest != pos: - heap[pos], heap[largest] = heap[largest], heap[pos] - self.heapify_down(heap, largest, heap_length) +cdef void heapify_up(self, PriorityHeapRecord* heap, SIZE_t pos) nogil: + """Restore heap invariant parent.improvement > child.improvement from + ``pos`` upwards. """ + if pos == 0: + return + + cdef SIZE_t parent_pos = (pos - 1) / 2 + + if heap[parent_pos].improvement < heap[pos].improvement: + heap[parent_pos], heap[pos] = heap[pos], heap[parent_pos] + self.heapify_up(heap, parent_pos) + +cdef void heapify_down(self, PriorityHeapRecord* heap, SIZE_t pos, + SIZE_t heap_length) nogil: + """Restore heap invariant parent.improvement > children.improvement from + ``pos`` downwards. """ + cdef SIZE_t left_pos = 2 * (pos + 1) - 1 + cdef SIZE_t right_pos = 2 * (pos + 1) + cdef SIZE_t largest = pos + + if (left_pos < heap_length and + heap[left_pos].improvement > heap[largest].improvement): + largest = left_pos + + if (right_pos < heap_length and + heap[right_pos].improvement > heap[largest].improvement): + largest = right_pos + + if largest != pos: + heap[pos], heap[largest] = heap[largest], heap[pos] + self.heapify_down(heap, largest, heap_length) cdef class PriorityHeap: From 9f1b5fd13b406f4725cdfaff410f87a220f56eab Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Sun, 17 Jul 2016 16:44:53 -0500 Subject: [PATCH 50/75] [ci skip] fix class-specific issues with heaps --- sklearn/tree/_utils.pyx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index 21756f89f50af..a327cb705b436 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -173,7 +173,7 @@ cdef class Stack: # PriorityHeap data structure # ============================================================================= -cdef void heapify_up(self, PriorityHeapRecord* heap, SIZE_t pos) nogil: +cdef void heapify_up(PriorityHeapRecord* heap, SIZE_t pos) nogil: """Restore heap invariant parent.improvement > child.improvement from ``pos`` upwards. """ if pos == 0: @@ -183,9 +183,9 @@ cdef void heapify_up(self, PriorityHeapRecord* heap, SIZE_t pos) nogil: if heap[parent_pos].improvement < heap[pos].improvement: heap[parent_pos], heap[pos] = heap[pos], heap[parent_pos] - self.heapify_up(heap, parent_pos) + heapify_up(heap, parent_pos) -cdef void heapify_down(self, PriorityHeapRecord* heap, SIZE_t pos, +cdef void heapify_down(PriorityHeapRecord* heap, SIZE_t pos, SIZE_t heap_length) nogil: """Restore heap invariant parent.improvement > children.improvement from ``pos`` downwards. 
""" @@ -203,7 +203,7 @@ cdef void heapify_down(self, PriorityHeapRecord* heap, SIZE_t pos, if largest != pos: heap[pos], heap[largest] = heap[largest], heap[pos] - self.heapify_down(heap, largest, heap_length) + heapify_down(heap, largest, heap_length) cdef class PriorityHeap: From 802e1fdd4f6abe2c0f73163dec51f2b0448115fe Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Sun, 17 Jul 2016 16:45:44 -0500 Subject: [PATCH 51/75] [ci skip] restore a newline --- sklearn/tree/_utils.pyx | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index a327cb705b436..ee0aea55eb08b 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -185,6 +185,7 @@ cdef void heapify_up(PriorityHeapRecord* heap, SIZE_t pos) nogil: heap[parent_pos], heap[pos] = heap[pos], heap[parent_pos] heapify_up(heap, parent_pos) + cdef void heapify_down(PriorityHeapRecord* heap, SIZE_t pos, SIZE_t heap_length) nogil: """Restore heap invariant parent.improvement > children.improvement from From c0401a5e7fb64c701120964d0bf1c9084244b9d9 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Sun, 17 Jul 2016 16:47:35 -0500 Subject: [PATCH 52/75] [ci skip] remove microchange to refactor later --- sklearn/tree/_criterion.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index ed037dd3aa57d..bd29b9b4fcc11 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -960,7 +960,7 @@ cdef class MSE(RegressionCriterion): for k in range(self.n_outputs): impurity_left[0] -= (sum_left[k] / self.weighted_n_left) ** 2.0 - impurity_right[0] -= (sum_right[k] / self.weighted_n_right) ** 2.0 + impurity_right[0] -= (sum_right[k] / self.weighted_n_right) ** 2.0 impurity_left[0] /= self.n_outputs impurity_right[0] /= self.n_outputs From 0bfc2c32e0f584d51bce6e53f629fb60823b5047 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Sun, 17 Jul 2016 17:13:00 -0500 Subject: [PATCH 53/75] reword a comment --- sklearn/tree/_criterion.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index bd29b9b4fcc11..69da8258675fc 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -1149,7 +1149,7 @@ cdef class MAE(RegressionCriterion): for k in range(self.n_outputs): y_ik = y[i * self.y_stride + k] - # remove y_ik with weight w from right and add to left + # remove y_ik and its weight w from right and add to left ( right_child_heaps[k]).remove(y_ik, w) ( left_child_heaps[k]).push(y_ik, w) From 702bb6bb70a1fc78cb3b65e44148f2e48f8ecd38 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Sun, 17 Jul 2016 17:18:31 -0500 Subject: [PATCH 54/75] remove heapify methods from queue class --- sklearn/tree/_utils.pxd | 3 --- 1 file changed, 3 deletions(-) diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd index fe803ba2fe779..f27bb436f1a36 100644 --- a/sklearn/tree/_utils.pxd +++ b/sklearn/tree/_utils.pxd @@ -104,9 +104,6 @@ cdef class PriorityHeap: double impurity, double impurity_left, double impurity_right) nogil cdef int pop(self, PriorityHeapRecord* res) nogil - cdef void heapify_up(self, PriorityHeapRecord* heap, SIZE_t pos) nogil - cdef void heapify_down(self, PriorityHeapRecord* heap, SIZE_t pos, - SIZE_t heap_length) nogil # ============================================================================= # WeightedPQueue data structure From 327ea19625e849092538e9d57aed737302f73817 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: 
Mon, 18 Jul 2016 05:42:48 -0500 Subject: [PATCH 55/75] doc: update docstrings for dt, rf, and et regressors --- sklearn/ensemble/forest.py | 12 ++++++++---- sklearn/tree/_criterion.pyx | 2 +- sklearn/tree/tree.py | 8 ++++---- 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/sklearn/ensemble/forest.py b/sklearn/ensemble/forest.py index e26323f65bfee..f4680071488f0 100644 --- a/sklearn/ensemble/forest.py +++ b/sklearn/ensemble/forest.py @@ -947,8 +947,10 @@ class RandomForestRegressor(ForestRegressor): The number of trees in the forest. criterion : string, optional (default="mse") - The function to measure the quality of a split. The only supported - criterion is "mse" for the mean squared error. + The function to measure the quality of a split. Supported criteria + are "mse" for the mean squared error, which is equal to variance + reduction as feature selection criterion, and "mae" for the mean + absolute error. max_features : int, float, string or None, optional (default="auto") The number of features to consider when looking for the best split: @@ -1299,8 +1301,10 @@ class ExtraTreesRegressor(ForestRegressor): The number of trees in the forest. criterion : string, optional (default="mse") - The function to measure the quality of a split. The only supported - criterion is "mse" for the mean squared error. + The function to measure the quality of a split. Supported criteria + are "mse" for the mean squared error, which is equal to variance + reduction as feature selection criterion, and "mae" for the mean + absolute error. max_features : int, float, string or None, optional (default="auto") The number of features to consider when looking for the best split: diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 69da8258675fc..9971ac6fe2cd2 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -725,7 +725,7 @@ cdef class RegressionCriterion(Criterion): self.sum_left = calloc(n_outputs, sizeof(double)) self.sum_right = calloc(n_outputs, sizeof(double)) - if (self.sum_total == NULL or + if (self.sum_total == NULL or self.sum_left == NULL or self.sum_right == NULL): raise MemoryError() diff --git a/sklearn/tree/tree.py b/sklearn/tree/tree.py index 4f8ebf9e960ed..deca4c7730754 100644 --- a/sklearn/tree/tree.py +++ b/sklearn/tree/tree.py @@ -783,10 +783,10 @@ class DecisionTreeRegressor(BaseDecisionTree, RegressorMixin): Parameters ---------- criterion : string, optional (default="mse") - The function to measure the quality of a split. Supported - criterions are "mse" for the mean squared error, which is - equal to variance reduction as feature selection criterion, - and "mae" for the mean absolute deviation. + The function to measure the quality of a split. Supported criteria + are "mse" for the mean squared error, which is equal to variance + reduction as feature selection criterion, and "mae" for the mean + absolute error. splitter : string, optional (default="best") The strategy used to choose the split at each node. 
Supported From 469274d4371bc43b4145b3ed84084df2e33e7039 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Mon, 18 Jul 2016 07:23:37 -0500 Subject: [PATCH 56/75] doc: revert incorrect spacing to shorten diff --- sklearn/tree/_criterion.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 9971ac6fe2cd2..0dbd837330427 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -725,7 +725,7 @@ cdef class RegressionCriterion(Criterion): self.sum_left = calloc(n_outputs, sizeof(double)) self.sum_right = calloc(n_outputs, sizeof(double)) - if (self.sum_total == NULL or + if (self.sum_total == NULL or self.sum_left == NULL or self.sum_right == NULL): raise MemoryError() @@ -1165,7 +1165,7 @@ cdef class MAE(RegressionCriterion): for k in range(self.n_outputs): y_ik = y[i * self.y_stride + k] - # remove y_ik from left and add to right + # remove y_ik and its weight w from left and add to right ( left_child_heaps[k]).remove(y_ik, w) ( right_child_heaps[k]).push(y_ik, w) From 560f6fa1879ab1a8e401344f0e7209e93e2a4843 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Mon, 18 Jul 2016 16:09:37 -0500 Subject: [PATCH 57/75] convert get_median to return value directly --- sklearn/tree/_criterion.pyx | 6 +++--- sklearn/tree/_utils.pxd | 9 ++++++--- sklearn/tree/_utils.pyx | 29 +++++++++++++---------------- 3 files changed, 22 insertions(+), 22 deletions(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 0dbd837330427..14beaed255c7f 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -1061,7 +1061,7 @@ cdef class MAE(RegressionCriterion): # calculate the node medians for k in range(self.n_outputs): - ( right_child_heaps[k]).get_median(&(self.node_medians[k])) + self.node_medians[k] = ( right_child_heaps[k]).get_median() # Reset to pos=start self.reset() @@ -1229,7 +1229,7 @@ cdef class MAE(RegressionCriterion): impurity_right[0] = 0.0 for k in range(self.n_outputs): - ( left_child_heaps[k]).get_median(&median) + median = ( left_child_heaps[k]).get_median() for p in range(start, pos): i = samples[p] @@ -1239,7 +1239,7 @@ cdef class MAE(RegressionCriterion): impurity_left[0] /= ((pos - start) * self.n_outputs) for k in range(self.n_outputs): - ( right_child_heaps[k]).get_median(&median) + median = ( right_child_heaps[k]).get_median() for p in range(pos, end): i = samples[p] diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd index f27bb436f1a36..88d6378c0c29b 100644 --- a/sklearn/tree/_utils.pxd +++ b/sklearn/tree/_utils.pxd @@ -2,6 +2,7 @@ # Peter Prettenhofer # Arnaud Joly # Jacob Schreiber +# Nelson Liu # # License: BSD 3 clause @@ -143,8 +144,10 @@ cdef class WeightedMedianHeap: cdef SIZE_t size(self) nogil cdef int push(self, DOUBLE_t data, DOUBLE_t weight) nogil - cdef int update_median_parameters_post_push(self, DOUBLE_t data, DOUBLE_t weight) nogil + cdef int update_median_parameters_post_push(self, DOUBLE_t data, + DOUBLE_t weight) nogil cdef int remove(self, DOUBLE_t data, DOUBLE_t weight) nogil cdef int pop(self, DOUBLE_t* data, DOUBLE_t* weight) nogil - cdef int update_median_parameters_post_remove(self, DOUBLE_t data, DOUBLE_t weight) nogil - cdef int get_median(self, double* median) nogil + cdef int update_median_parameters_post_remove(self, DOUBLE_t data, + DOUBLE_t weight) nogil + cdef double get_median(self) nogil diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index ee0aea55eb08b..e4b93fa2f10bf 100644 --- 
a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -328,7 +328,8 @@ cdef class WeightedPQueue: def __cinit__(self, SIZE_t capacity): self.capacity = capacity self.array_ptr = 0 - self.array_ = calloc(capacity, sizeof(WeightedPQueueRecord)) + self.array_ = calloc(capacity, + sizeof(WeightedPQueueRecord)) if self.array_ == NULL: raise MemoryError() @@ -485,9 +486,8 @@ cdef class WeightedMedianHeap: cdef int push(self, DOUBLE_t data, DOUBLE_t weight) nogil: """Push a value and its associated weight to the WeightedMedianHeap to be considered - in the median calculation + in the median calculation. """ - cdef double current_median cdef int return_value return_value = self.samples.push(data, weight) @@ -508,7 +508,7 @@ return 0 # get the original weighted median - self.get_median(&current_median) + current_median = self.get_median() self.total_weight += weight if data < current_median: @@ -521,8 +521,8 @@ # minimize k such that sum(W[0:k]) >= total_weight / 2 # minimum value of k is 1 - while(self.k > 1 and (self.sum_w_0_k - - self.samples.get_weight_from_index(self.k-1) + while(self.k > 1 and ((self.sum_w_0_k - + self.samples.get_weight_from_index(self.k-1)) >= self.total_weight / 2.0)): self.k -= 1 self.sum_w_0_k -= self.samples.get_weight_from_index(self.k) @@ -568,7 +568,7 @@ DOUBLE_t weight) nogil: """Update the parameters used in the median calculation, namely `k` and `sum_w_0_k` after a removal""" - cdef DOUBLE_t current_median + cdef double current_median # reset parameters because empty if self.samples.size() == 0: self.k = 0 @@ -584,7 +584,7 @@ return 0 # get the current weighted median - self.get_median(&current_median) + current_median = self.get_median() self.total_weight -= weight if data < current_median: @@ -609,22 +609,19 @@ # removing above the median # minimize k such that sum(W[0:k]) >= total_weight / 2 while(self.k > 1 and ((self.sum_w_0_k - - self.samples.get_weight_from_index(self.k-1)) + self.samples.get_weight_from_index(self.k-1)) >= self.total_weight / 2.0)): self.k -= 1 self.sum_w_0_k -= self.samples.get_weight_from_index(self.k) return 0 - cdef int get_median(self, double* median) nogil: + cdef double get_median(self) nogil: """Return the median, taking into account sample weights.""" - if self.sum_w_0_k < (self.total_weight / 2.0): - return -1 if self.sum_w_0_k == (self.total_weight / 2.0): # split median - median[0] = (self.samples.get_value_from_index(self.k) + - self.samples.get_value_from_index(self.k-1)) / 2.0 + return (self.samples.get_value_from_index(self.k) + + self.samples.get_value_from_index(self.k-1)) / 2.0 if self.sum_w_0_k > (self.total_weight / 2.0): # whole median - median[0] = self.samples.get_value_from_index(self.k-1) - return 0 + return self.samples.get_value_from_index(self.k-1) From 87b01807f6e03066b19df7dca37775d58cc8d0f3 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Mon, 18 Jul 2016 16:11:14 -0500 Subject: [PATCH 58/75] [ci skip] remove accidental whitespace --- sklearn/tree/_criterion.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 14beaed255c7f..40fb73b4bdb69 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -1305,5 +1305,5 @@ cdef class FriedmanMSE(MSE): diff = (self.weighted_n_right * total_sum_left - self.weighted_n_left * total_sum_right) / self.n_outputs -
return (diff * diff / (self.weighted_n_left * self.weighted_n_right * + return (diff * diff / (self.weighted_n_left * self.weighted_n_right * self.weighted_n_node_samples)) From ecae6754fae1a919697696043a2dfa8269b8f477 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Mon, 18 Jul 2016 20:56:18 -0700 Subject: [PATCH 59/75] remove extraneous unpacking of values --- sklearn/tree/_utils.pyx | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index e4b93fa2f10bf..dec793b44da9e 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -431,9 +431,8 @@ cdef class WeightedPQueue: cdef int peek(self, DOUBLE_t* data, DOUBLE_t* weight) nogil: """Write the top element from array to a pointer. Returns 0 if successful, -1 if nothing to write.""" - cdef SIZE_t array_ptr = self.array_ptr cdef WeightedPQueueRecord* array = self.array_ - if array_ptr <= 0: + if self.array_ptr <= 0: return -1 # Take first value data[0] = array[0].data @@ -443,10 +442,8 @@ cdef class WeightedPQueue: cdef DOUBLE_t get_weight_from_index(self, SIZE_t index) nogil: """Given an index between [0,self.current_capacity], access the appropriate heap and return the requested weight""" - cdef SIZE_t array_ptr = self.array_ptr cdef WeightedPQueueRecord* array = self.array_ - - if array_ptr <= 0 or index >= array_ptr: + if self.array_ptr <= 0 or index >= self.array_ptr: with gil: raise ValueError("Tried to access element " "at index out of bounds.") From 6c2835862aba0f77f4cb8d83a10176e168d6217c Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Mon, 18 Jul 2016 21:02:54 -0700 Subject: [PATCH 60/75] style: misc changes to identifiers --- sklearn/tree/_criterion.pyx | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 40fb73b4bdb69..5be5700c45f93 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -1035,7 +1035,7 @@ cdef class MAE(RegressionCriterion): cdef SIZE_t k cdef DOUBLE_t y_ik cdef DOUBLE_t w = 1.0 - # Fill accumulators with MedianHeaps + with gil: for k in range(self.n_outputs): self.left_child_heaps[k] = WeightedMedianHeap(self.n_node_samples) @@ -1071,8 +1071,8 @@ cdef class MAE(RegressionCriterion): cdef SIZE_t i cdef SIZE_t k - cdef DOUBLE_t popped_value - cdef DOUBLE_t popped_weight + cdef DOUBLE_t value + cdef DOUBLE_t weight cdef void** left_child_heaps = self.left_child_heaps.data cdef void** right_child_heaps = self.right_child_heaps.data @@ -1088,10 +1088,10 @@ cdef class MAE(RegressionCriterion): # if left has no elements, it's already reset for i in range(( left_child_heaps[k]).size()): # remove everything from left and put it into right - ( left_child_heaps[k]).pop(&popped_value, - &popped_weight) - ( right_child_heaps[k]).push(popped_value, - popped_weight) + ( left_child_heaps[k]).pop(&value, + &weight) + ( right_child_heaps[k]).push(value, + weight) cdef void reverse_reset(self) nogil: """Reset the criterion at pos=end.""" @@ -1100,8 +1100,8 @@ cdef class MAE(RegressionCriterion): self.weighted_n_left = self.weighted_n_node_samples self.pos = self.end - cdef DOUBLE_t popped_value - cdef DOUBLE_t popped_weight + cdef DOUBLE_t value + cdef DOUBLE_t weight cdef void** left_child_heaps = self.left_child_heaps.data cdef void** right_child_heaps = self.right_child_heaps.data @@ -1111,10 +1111,10 @@ cdef class MAE(RegressionCriterion): # if right has no elements, it's already reset for i in range(( 
right_child_heaps[k]).size()): # remove everything from right and put it into left - ( right_child_heaps[k]).pop(&popped_value, - &popped_weight) - ( left_child_heaps[k]).push(popped_value, - popped_weight) + ( right_child_heaps[k]).pop(&value, + &weight) + ( left_child_heaps[k]).push(value, + weight) cdef void update(self, SIZE_t new_pos) nogil: """Updated statistics by moving samples[pos:new_pos] to the left.""" From 0db99659a216ec1e90099726edef5ace53da30a6 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Tue, 19 Jul 2016 11:46:04 -0700 Subject: [PATCH 61/75] add docstrings and more informative variable identifiers --- sklearn/tree/_criterion.pyx | 66 ++++++++++++++++++------------------- sklearn/tree/_utils.pxd | 4 +-- sklearn/tree/_utils.pyx | 55 +++++++++++++++++++++++++------ 3 files changed, 80 insertions(+), 45 deletions(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 5be5700c45f93..2834609cff17a 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -29,7 +29,7 @@ np.import_array() from ._utils cimport log from ._utils cimport safe_realloc from ._utils cimport sizet_ptr_to_ndarray -from ._utils cimport WeightedMedianHeap +from ._utils cimport WeightedMedianCalculator cdef class Criterion: """Interface for impurity criteria. @@ -971,8 +971,8 @@ cdef class MAE(RegressionCriterion): """Destructor.""" free(self.node_medians) - cdef np.ndarray left_child_heaps - cdef np.ndarray right_child_heaps + cdef np.ndarray left_child + cdef np.ndarray right_child cdef double* node_medians def __cinit__(self, SIZE_t n_outputs): @@ -1010,8 +1010,8 @@ cdef class MAE(RegressionCriterion): if (self.node_medians == NULL): raise MemoryError() - self.left_child_heaps = np.empty(n_outputs, dtype='object') - self.right_child_heaps = np.empty(n_outputs, dtype='object') + self.left_child = np.empty(n_outputs, dtype='object') + self.right_child = np.empty(n_outputs, dtype='object') cdef void init(self, DOUBLE_t* y, SIZE_t y_stride, DOUBLE_t* sample_weight, double weighted_n_samples, SIZE_t* samples, SIZE_t start, @@ -1038,11 +1038,11 @@ cdef class MAE(RegressionCriterion): with gil: for k in range(self.n_outputs): - self.left_child_heaps[k] = WeightedMedianHeap(self.n_node_samples) - self.right_child_heaps[k] = WeightedMedianHeap(self.n_node_samples) + self.left_child[k] = WeightedMedianCalculator(self.n_node_samples) + self.right_child[k] = WeightedMedianCalculator(self.n_node_samples) - cdef void** left_child_heaps = self.left_child_heaps.data - cdef void** right_child_heaps = self.right_child_heaps.data + cdef void** left_child = self.left_child.data + cdef void** right_child = self.right_child.data for p in range(start, end): i = samples[p] @@ -1055,13 +1055,13 @@ cdef class MAE(RegressionCriterion): # push all values to the right side, # since pos = start initially anyway - ( right_child_heaps[k]).push(y_ik, w) + ( right_child[k]).push(y_ik, w) self.weighted_n_node_samples += w # calculate the node medians for k in range(self.n_outputs): - self.node_medians[k] = ( right_child_heaps[k]).get_median() + self.node_medians[k] = ( right_child[k]).get_median() # Reset to pos=start self.reset() @@ -1074,8 +1074,8 @@ cdef class MAE(RegressionCriterion): cdef DOUBLE_t value cdef DOUBLE_t weight - cdef void** left_child_heaps = self.left_child_heaps.data - cdef void** right_child_heaps = self.right_child_heaps.data + cdef void** left_child = self.left_child.data + cdef void** right_child = self.right_child.data self.weighted_n_left = 0.0 self.weighted_n_right = 
self.weighted_n_node_samples @@ -1086,11 +1086,11 @@ cdef class MAE(RegressionCriterion): for k in range(self.n_outputs): # if left has no elements, it's already reset for i in range(( left_child[k]).size()): # remove everything from left and put it into right - ( left_child_heaps[k]).pop(&value, + ( left_child[k]).pop(&value, &weight) - ( right_child_heaps[k]).push(value, + ( right_child[k]).push(value, weight) cdef void reverse_reset(self) nogil: @@ -1102,18 +1102,18 @@ cdef class MAE(RegressionCriterion): cdef DOUBLE_t value cdef DOUBLE_t weight - cdef void** left_child_heaps = self.left_child_heaps.data - cdef void** right_child_heaps = self.right_child_heaps.data + cdef void** left_child = self.left_child.data + cdef void** right_child = self.right_child.data # reverse reset the WeightedMedianCalculators, right should have no # elements and left should have all elements. for k in range(self.n_outputs): # if right has no elements, it's already reset for i in range(( right_child[k]).size()): # remove everything from right and put it into left - ( right_child[k]).pop(&value, + ( right_child[k]).pop(&value, &weight) - ( left_child[k]).push(value, + ( left_child[k]).push(value, weight) cdef void update(self, SIZE_t new_pos) nogil: """Update statistics by moving samples[pos:new_pos] to the left.""" cdef double* sample_weight = self.sample_weight cdef SIZE_t* samples = self.samples - cdef void** left_child_heaps = self.left_child_heaps.data - cdef void** right_child_heaps = self.right_child_heaps.data + cdef void** left_child = self.left_child.data + cdef void** right_child = self.right_child.data cdef DOUBLE_t* y = self.y cdef SIZE_t pos = self.pos cdef SIZE_t end = self.end cdef SIZE_t i cdef SIZE_t p cdef SIZE_t k cdef DOUBLE_t w = 1.0 cdef DOUBLE_t y_ik # Update statistics up to new_pos # # We are going to update right_child and left_child # from the direction that requires the least amount of # computation, i.e. from pos to new_pos or from end to new_pos.
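The comment above is the heart of MAE.update: samples are always moved into the left child, but from whichever end of the range is cheaper. A rough pure-Python sketch of that control flow follows; move_to_left, samples, y, w, left, and right are hypothetical stand-ins rather than the Cython API, and the real method also maintains weighted_n_left and weighted_n_right as weights cross over.

    # Sketch: move samples[pos:new_pos] into the left child, working from
    # whichever end of the range needs fewer operations. `left` and `right`
    # are any objects exposing push(value, weight) and remove(value, weight).
    def move_to_left(samples, y, w, pos, new_pos, end, left, right):
        if (new_pos - pos) <= (end - new_pos):
            # forward pass: only samples[pos:new_pos] change sides
            for p in range(pos, new_pos):
                i = samples[p]
                right.remove(y[i], w[i])
                left.push(y[i], w[i])
        else:
            # backward pass: first put every remaining sample into the left
            # child (the reverse_reset step), then push samples[new_pos:end]
            # back to the right
            for p in range(pos, end):
                i = samples[p]
                right.remove(y[i], w[i])
                left.push(y[i], w[i])
            for p in range(end - 1, new_pos - 1, -1):
                i = samples[p]
                left.remove(y[i], w[i])
                right.push(y[i], w[i])
        return new_pos

Either branch ends with left holding samples[start:new_pos] and right holding samples[new_pos:end]; the backward pass pays off when new_pos sits close to end, since reverse_reset drains one side with cheap pops rather than linear-time removes.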
@@ -1150,8 +1150,8 @@ cdef class MAE(RegressionCriterion): for k in range(self.n_outputs): y_ik = y[i * self.y_stride + k] # remove y_ik and its weight w from right and add to left - ( right_child_heaps[k]).remove(y_ik, w) - ( left_child_heaps[k]).push(y_ik, w) + ( right_child[k]).remove(y_ik, w) + ( left_child[k]).push(y_ik, w) self.weighted_n_left += w else: @@ -1166,8 +1166,8 @@ cdef class MAE(RegressionCriterion): for k in range(self.n_outputs): y_ik = y[i * self.y_stride + k] # remove y_ik and its weight w from left and add to right - ( left_child_heaps[k]).remove(y_ik, w) - ( right_child_heaps[k]).push(y_ik, w) + ( left_child[k]).remove(y_ik, w) + ( right_child[k]).push(y_ik, w) self.weighted_n_left -= w @@ -1222,14 +1222,14 @@ cdef class MAE(RegressionCriterion): cdef DOUBLE_t y_ik cdef DOUBLE_t median - cdef void** left_child_heaps = self.left_child_heaps.data - cdef void** right_child_heaps = self.right_child_heaps.data + cdef void** left_child = self.left_child.data + cdef void** right_child = self.right_child.data impurity_left[0] = 0.0 impurity_right[0] = 0.0 for k in range(self.n_outputs): - median = ( left_child_heaps[k]).get_median() + median = ( left_child[k]).get_median() for p in range(start, pos): i = samples[p] @@ -1239,7 +1239,7 @@ cdef class MAE(RegressionCriterion): impurity_left[0] /= ((pos - start) * self.n_outputs) for k in range(self.n_outputs): - median = ( right_child_heaps[k]).get_median() + median = ( right_child[k]).get_median() for p in range(pos, end): i = samples[p] @@ -1305,5 +1305,5 @@ cdef class FriedmanMSE(MSE): diff = (self.weighted_n_right * total_sum_left - self.weighted_n_left * total_sum_right) / self.n_outputs - return (diff * diff / (self.weighted_n_left * self.weighted_n_right * + return (diff * diff / (self.weighted_n_left * self.weighted_n_right * self.weighted_n_node_samples)) diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd index 88d6378c0c29b..b07e550789f89 100644 --- a/sklearn/tree/_utils.pxd +++ b/sklearn/tree/_utils.pxd @@ -131,10 +131,10 @@ cdef class WeightedPQueue: # ============================================================================= -# MedianHeap data structure +# WeightedMedianCalculator data structure # ============================================================================= -cdef class WeightedMedianHeap: +cdef class WeightedMedianCalculator: cdef SIZE_t initial_capacity cdef WeightedPQueue samples cdef DOUBLE_t total_weight diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index dec793b44da9e..40d6d19c5ca85 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -314,15 +314,17 @@ cdef class WeightedPQueue: Attributes ---------- capacity : SIZE_t - The capacity of the array + The capacity of the priority queue. array_ptr : SIZE_t - The water mark of the array; the array grows from left to right in the - array ``array_``. array_ptr is always less than capacity. + The water mark of the priority queue; the priority queue grows from + left to right in the array ``array_``. ``array_ptr`` is always + less than ``capacity``. array_ : WeightedPQueueRecord* - The array of array records. The minimum element is on the left; - the array grows from left to right + The array of priority queue records. The minimum element is on the + left at index 0, and the maximum element is on the right at index + ``array_ptr-1``. 
""" def __cinit__(self, SIZE_t capacity): @@ -464,10 +466,42 @@ cdef class WeightedPQueue: return array[index].data # ============================================================================= -# WeightedMedianHeap data structure +# WeightedMedianCalculator data structure # ============================================================================= -cdef class WeightedMedianHeap: +cdef class WeightedMedianCalculator: + """A class to handle calculation of the weighted median from streams of + data. To do so, it maintains a parameter ``k`` such that the sum of the + weights in the range [0,k) is greater than or equal to half of the total + weight. By minimizing the value of ``k`` that fulfills this constraint, + calculating the median is done by either taking the value of the sample + at index ``k-1`` of ``samples`` (samples[k-1].data) or the average of + the samples at index ``k-1`` and ``k`` of ``samples`` + ((samples[k-1] + samples[k]) / 2). + + Attributes + ---------- + initial_capacity : SIZE_t + The initial capacity of the WeightedMedianCalculator. + + samples : WeightedPQueue + Holds the samples (consisting of values and their weights) used in the + weighted median calculation. + + total_weight : DOUBLE_t + The sum of the weights of items in ``samples``. Represents the total + weight of all samples used in the median calculation. + + k : SIZE_t + Index used to calculate the median. + + sum_w_0_k : DOUBLE_t + The sum of the weights from samples[0:k]. Used in the weighted + median calculation; minimizing the value of ``k`` such that + ``sum_w_0_k`` >= ``total_weight / 2`` provides a mechanism for + calculating the median in constant time. + + """ def __cinit__(self, SIZE_t initial_capacity): self.initial_capacity = initial_capacity @@ -477,12 +511,13 @@ cdef class WeightedMedianHeap: self.sum_w_0_k = 0 cdef SIZE_t size(self) nogil: - """Return the number of samples in the WeightedMedianHeap""" + """Return the number of samples in the + WeightedMedianCalculator""" return self.samples.size() cdef int push(self, DOUBLE_t data, DOUBLE_t weight) nogil: """Push a value and its associated weight - to the WeightedMedianHeap to be considered + to the WeightedMedianCalculator to be considered in the median calculation. 
""" cdef int return_value @@ -566,7 +601,7 @@ cdef class WeightedMedianHeap: """Update the parameters used in the median calculation, namely `k` and `sum_w_0_k` after a removal""" cdef double current_median - # reset parameters because empty + # reset parameters because it there are no elements if self.samples.size() == 0: self.k = 0 self.total_weight = 0 From e37341665f98540a33026bd45b49b19d5f4ac18d Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Tue, 19 Jul 2016 11:58:44 -0700 Subject: [PATCH 62/75] [ci skip] add trivial comments to recythonize --- sklearn/tree/_splitter.pyx | 2 +- sklearn/tree/_tree.pyx | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 0617508aab236..f44d5da2b9ede 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -14,7 +14,7 @@ # Jacob Schreiber # # License: BSD 3 clause - +# trivial comment for recythonize from ._criterion cimport Criterion from libc.stdlib cimport free diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index f44320a7b47ae..8f052b2a69058 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -14,7 +14,7 @@ # Jacob Schreiber # # License: BSD 3 clause - +# trivial comment for recythonize from cpython cimport Py_INCREF, PyObject from libc.stdlib cimport free From 448bb6e518a02d45fbaab5cabdbfc639129b39c1 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Tue, 19 Jul 2016 11:59:06 -0700 Subject: [PATCH 63/75] remove trivial comments for recythonizing --- sklearn/tree/_splitter.pyx | 2 +- sklearn/tree/_tree.pyx | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index f44d5da2b9ede..0617508aab236 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -14,7 +14,7 @@ # Jacob Schreiber # # License: BSD 3 clause -# trivial comment for recythonize + from ._criterion cimport Criterion from libc.stdlib cimport free diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 8f052b2a69058..f44320a7b47ae 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -14,7 +14,7 @@ # Jacob Schreiber # # License: BSD 3 clause -# trivial comment for recythonize + from cpython cimport Py_INCREF, PyObject from libc.stdlib cimport free From c44f327076dfb6b68003fb1bf986b5ae09d70083 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Tue, 19 Jul 2016 12:09:13 -0700 Subject: [PATCH 64/75] force recythonization for real this time --- sklearn/tree/_splitter.pyx | 2 +- sklearn/tree/_tree.pyx | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 0617508aab236..61e40cc81f97b 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -14,7 +14,7 @@ # Jacob Schreiber # # License: BSD 3 clause - +# another trivial comment from ._criterion cimport Criterion from libc.stdlib cimport free diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index f44320a7b47ae..e7bf20b9a1b70 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -14,7 +14,7 @@ # Jacob Schreiber # # License: BSD 3 clause - +# another trivial comment from cpython cimport Py_INCREF, PyObject from libc.stdlib cimport free From 8d442cf8c595f6368b47f15f5e4f63dd92d45111 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Tue, 19 Jul 2016 13:43:00 -0700 Subject: [PATCH 65/75] remove trivial comments for recythonization --- sklearn/tree/_splitter.pyx | 2 +- sklearn/tree/_tree.pyx | 2 +- 2 files 
changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 61e40cc81f97b..0617508aab236 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -14,7 +14,7 @@ # Jacob Schreiber # # License: BSD 3 clause -# another trivial comment + from ._criterion cimport Criterion from libc.stdlib cimport free diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index e7bf20b9a1b70..f44320a7b47ae 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -14,7 +14,7 @@ # Jacob Schreiber # # License: BSD 3 clause -# another trivial comment + from cpython cimport Py_INCREF, PyObject from libc.stdlib cimport free From a0085380be5aa73c046f8dfc30e037d813287fce Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Wed, 20 Jul 2016 08:38:33 -0700 Subject: [PATCH 66/75] rfc: harmonize arg. names and remove unnecessary checks --- sklearn/tree/_utils.pxd | 4 ++-- sklearn/tree/_utils.pyx | 14 +++----------- 2 files changed, 5 insertions(+), 13 deletions(-) diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd index b07e550789f89..45ce0f56acead 100644 --- a/sklearn/tree/_utils.pxd +++ b/sklearn/tree/_utils.pxd @@ -123,9 +123,9 @@ cdef class WeightedPQueue: cdef bint is_empty(self) nogil cdef SIZE_t size(self) nogil cdef int push(self, DOUBLE_t data, DOUBLE_t weight) nogil - cdef int remove(self, DOUBLE_t value, DOUBLE_t weight) nogil + cdef int remove(self, DOUBLE_t data, DOUBLE_t weight) nogil cdef int pop(self, DOUBLE_t* data, DOUBLE_t* weight) nogil - cdef int peek(self, DOUBLE_t* res, DOUBLE_t* weight) nogil + cdef int peek(self, DOUBLE_t* data, DOUBLE_t* weight) nogil cdef DOUBLE_t get_weight_from_index(self, SIZE_t index) nogil cdef DOUBLE_t get_value_from_index(self, SIZE_t index) nogil diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index 40d6d19c5ca85..10b2adcfab560 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -381,7 +381,7 @@ cdef class WeightedPQueue: self.array_ptr = array_ptr + 1 return 0 - cdef int remove(self, DOUBLE_t value, DOUBLE_t weight) nogil: + cdef int remove(self, DOUBLE_t data, DOUBLE_t weight) nogil: """Remove a specific value/weight record from the array. 
Returns 0 if successful, -1 if record not found.""" cdef SIZE_t array_ptr = self.array_ptr @@ -394,7 +394,7 @@ cdef class WeightedPQueue: # find element to remove for i in range(array_ptr): - if array[i].data == value and array[i].weight == weight: + if array[i].data == data and array[i].weight == weight: idx_to_remove = i break @@ -445,23 +445,15 @@ cdef class WeightedPQueue: """Given an index between [0,self.current_capacity], access the appropriate heap and return the requested weight""" cdef WeightedPQueueRecord* array = self.array_ - if self.array_ptr <= 0 or index >= self.array_ptr: - with gil: - raise ValueError("Tried to access element " - "at index out of bounds.") + # get weight at index return array[index].weight cdef DOUBLE_t get_value_from_index(self, SIZE_t index) nogil: """Given an index between [0,self.current_capacity], access the appropriate heap and return the requested value""" - cdef SIZE_t array_ptr = self.array_ptr cdef WeightedPQueueRecord* array = self.array_ - if array_ptr <= 0 or index >= array_ptr: - with gil: - raise ValueError("Tried to access element " - "at index out of bounds.") # get value at index return array[index].data From 929153c8925354845af4fcfb878f92499face390 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Wed, 20 Jul 2016 09:03:10 -0700 Subject: [PATCH 67/75] convert allocations to safe_realloc --- sklearn/tree/_criterion.pyx | 10 +++++----- sklearn/tree/_utils.pxd | 2 ++ sklearn/tree/_utils.pyx | 4 +--- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 2834609cff17a..a0fe837bf6d2f 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -973,7 +973,7 @@ cdef class MAE(RegressionCriterion): cdef np.ndarray left_child cdef np.ndarray right_child - cdef double* node_medians + cdef DOUBLE_t* node_medians def __cinit__(self, SIZE_t n_outputs): """Initialize parameters for this criterion. @@ -1005,7 +1005,7 @@ cdef class MAE(RegressionCriterion): self.node_medians = NULL # Allocate memory for the accumulators - self.node_medians = calloc(n_outputs, sizeof(double)) + safe_realloc(&self.node_medians, n_outputs) if (self.node_medians == NULL): raise MemoryError() @@ -1119,7 +1119,7 @@ cdef class MAE(RegressionCriterion): cdef void update(self, SIZE_t new_pos) nogil: """Updated statistics by moving samples[pos:new_pos] to the left.""" - cdef double* sample_weight = self.sample_weight + cdef DOUBLE_t* sample_weight = self.sample_weight cdef SIZE_t* samples = self.samples cdef void** left_child = self.left_child.data @@ -1180,7 +1180,7 @@ cdef class MAE(RegressionCriterion): cdef SIZE_t k for k in range(self.n_outputs): - dest[k] = self.node_medians[k] + dest[k] = self.node_medians[k] cdef double node_impurity(self) nogil: """Evaluate the impurity of the current node, i.e. 
the impurity of @@ -1201,7 +1201,7 @@ cdef class MAE(RegressionCriterion): y_ik = y[i * self.y_stride + k] - impurity += fabs(( y_ik) - self.node_medians[k]) + impurity += fabs(( y_ik) - self.node_medians[k]) return impurity / (self.weighted_n_node_samples * self.n_outputs) cdef void children_impurity(self, double* impurity_left, diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd index 45ce0f56acead..1b2eb0e76d39f 100644 --- a/sklearn/tree/_utils.pxd +++ b/sklearn/tree/_utils.pxd @@ -34,6 +34,8 @@ ctypedef fused realloc_ptr: (DTYPE_t*) (SIZE_t*) (unsigned char*) + (WeightedPQueueRecord*) + (DOUBLE_t*) cdef realloc_ptr safe_realloc(realloc_ptr* p, size_t nelems) except * diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index 10b2adcfab560..398ef897d1e76 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -13,7 +13,6 @@ from libc.stdlib cimport free from libc.stdlib cimport malloc -from libc.stdlib cimport calloc from libc.stdlib cimport realloc from libc.math cimport log as ln @@ -330,8 +329,7 @@ cdef class WeightedPQueue: def __cinit__(self, SIZE_t capacity): self.capacity = capacity self.array_ptr = 0 - self.array_ = calloc(capacity, - sizeof(WeightedPQueueRecord)) + safe_realloc(&self.array_, capacity) if self.array_ == NULL: raise MemoryError() From f383c94cdd8f2f5045e2b1a28ceba0a509da2d22 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Wed, 20 Jul 2016 13:10:13 -0700 Subject: [PATCH 68/75] fix bug in weighted case and add tests for MAE --- sklearn/tree/_criterion.pyx | 4 ++-- sklearn/tree/_utils.pxd | 6 +++-- sklearn/tree/_utils.pyx | 40 ++++++++++++++++++++------------- sklearn/tree/tests/test_tree.py | 13 +++++++++++ 4 files changed, 44 insertions(+), 19 deletions(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index a0fe837bf6d2f..723810775f789 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -1236,7 +1236,7 @@ cdef class MAE(RegressionCriterion): y_ik = y[i * self.y_stride + k] impurity_left[0] += fabs(( y_ik) - median) - impurity_left[0] /= ((pos - start) * self.n_outputs) + impurity_left[0] /= ((self.weighted_n_left) * self.n_outputs) for k in range(self.n_outputs): median = ( right_child[k]).get_median() @@ -1246,7 +1246,7 @@ cdef class MAE(RegressionCriterion): y_ik = y[i * self.y_stride + k] impurity_right[0] += fabs(( y_ik) - median) - impurity_right[0] /= ((end - pos) * self.n_outputs) + impurity_right[0] /= ((self.weighted_n_right) * self.n_outputs) cdef class FriedmanMSE(MSE): diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd index 1b2eb0e76d39f..25c09783c73ba 100644 --- a/sklearn/tree/_utils.pxd +++ b/sklearn/tree/_utils.pxd @@ -147,9 +147,11 @@ cdef class WeightedMedianCalculator: cdef SIZE_t size(self) nogil cdef int push(self, DOUBLE_t data, DOUBLE_t weight) nogil cdef int update_median_parameters_post_push(self, DOUBLE_t data, - DOUBLE_t weight) nogil + DOUBLE_t weight, + double original_median) nogil cdef int remove(self, DOUBLE_t data, DOUBLE_t weight) nogil cdef int pop(self, DOUBLE_t* data, DOUBLE_t* weight) nogil cdef int update_median_parameters_post_remove(self, DOUBLE_t data, - DOUBLE_t weight) nogil + DOUBLE_t weight, + double original_median) nogil cdef double get_median(self) nogil diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index 398ef897d1e76..0a98718ab9d20 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -511,16 +511,20 @@ cdef class WeightedMedianCalculator: in the median calculation. 
""" cdef int return_value + cdef double original_median + if self.size() != 0: + original_median = self.get_median() return_value = self.samples.push(data, weight) - self.update_median_parameters_post_push(data, weight) + self.update_median_parameters_post_push(data, weight, + original_median) return return_value cdef int update_median_parameters_post_push(self, DOUBLE_t data, - DOUBLE_t weight) nogil: + DOUBLE_t weight, + double original_median) nogil: """Update the parameters used in the median calculation, namely `k` and `sum_w_0_k` after an insertion""" - cdef double current_median # trivial case of one element. if self.size() == 1: @@ -530,10 +534,9 @@ cdef class WeightedMedianCalculator: return 0 # get the original weighted median - current_median = self.get_median() self.total_weight += weight - if data < current_median: + if data < original_median: # inserting below the median, so increment k and # then update self.sum_w_0_k accordingly by adding # the weight that was added. @@ -548,10 +551,9 @@ cdef class WeightedMedianCalculator: >= self.total_weight / 2.0)): self.k -= 1 self.sum_w_0_k -= self.samples.get_weight_from_index(self.k) - return 0 - if data >= current_median: + if data >= original_median: # inserting above or at the median # minimize k such that sum(W[0:k]) >= total_weight / 2 while(self.k < self.samples.size() and @@ -564,11 +566,15 @@ cdef class WeightedMedianCalculator: """Remove a value from the MedianHeap, removing it from consideration in the median calculation """ - cdef double current_unweighted_median cdef int return_value + cdef double original_median + + if self.size() != 0: + original_median = self.get_median() return_value = self.samples.remove(data, weight) - self.update_median_parameters_post_remove(data, weight) + self.update_median_parameters_post_remove(data, weight, + original_median) return return_value cdef int pop(self, DOUBLE_t* data, DOUBLE_t* weight) nogil: @@ -576,6 +582,10 @@ cdef class WeightedMedianCalculator: left and moving to the right. 
""" cdef int return_value + cdef double original_median + + if self.size() != 0: + original_median = self.get_median() # no elements to pop if self.samples.size() == 0: @@ -583,14 +593,15 @@ cdef class WeightedMedianCalculator: return_value = self.samples.pop(data, weight) self.update_median_parameters_post_remove(data[0], - weight[0]) + weight[0], + original_median) return return_value cdef int update_median_parameters_post_remove(self, DOUBLE_t data, - DOUBLE_t weight) nogil: + DOUBLE_t weight, + double original_median) nogil: """Update the parameters used in the median calculation, namely `k` and `sum_w_0_k` after a removal""" - cdef double current_median # reset parameters because it there are no elements if self.samples.size() == 0: self.k = 0 @@ -606,10 +617,9 @@ cdef class WeightedMedianCalculator: return 0 # get the current weighted median - current_median = self.get_median() self.total_weight -= weight - if data < current_median: + if data < original_median: # removing below the median, so decrement k and # then update self.sum_w_0_k accordingly by subtracting # the removed weight @@ -627,7 +637,7 @@ cdef class WeightedMedianCalculator: self.sum_w_0_k += self.samples.get_weight_from_index(self.k-1) return 0 - if data >= current_median: + if data >= original_median: # removing above the median # minimize k such that sum(W[0:k]) >= total_weight / 2 while(self.k > 1 and ((self.sum_w_0_k - diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 0053155f8622f..78a35fe5becc1 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -1443,3 +1443,16 @@ def test_no_sparse_y_support(): # Currently we don't support sparse y for name in ALL_TREES: yield (check_no_sparse_y_support, name) + +def test_mae(): + # check MAE criterion produces correct results + # on small toy dataset + dt_mae = DecisionTreeRegressor(random_state=0, criterion="mae", + max_leaf_nodes=2) + dt_mae.fit([[3],[5],[3],[8],[5]],[6,7,3,4,3]) + assert_array_equal(dt_mae.tree_.impurity, [1.4, 1.5, 4.0/3.0]) + assert_array_equal(dt_mae.tree_.value.flat, [4, 4.5, 4.0]) + + dt_mae.fit([[3],[5],[3],[8],[5]],[6,7,3,4,3], [0.6,0.3,0.1,1.0,0.3]) + assert_array_equal(dt_mae.tree_.impurity, [7.0/2.3, 3.0/0.7, 4.0/1.6]) + assert_array_equal(dt_mae.tree_.value.flat, [4.0, 6.0, 4.0]) From 6a1f3d4b9820449d9f3bce709c0cb376d865a5cf Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Wed, 20 Jul 2016 13:50:55 -0700 Subject: [PATCH 69/75] change all medians to DOUBLE_t --- sklearn/tree/_criterion.pyx | 9 ++++++--- sklearn/tree/_utils.pxd | 6 +++--- sklearn/tree/_utils.pyx | 8 ++++---- 3 files changed, 13 insertions(+), 10 deletions(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 723810775f789..23a2c206f24ef 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -1235,7 +1235,8 @@ cdef class MAE(RegressionCriterion): y_ik = y[i * self.y_stride + k] - impurity_left[0] += fabs(( y_ik) - median) + impurity_left[0] += fabs(( y_ik) - + median) impurity_left[0] /= ((self.weighted_n_left) * self.n_outputs) for k in range(self.n_outputs): @@ -1245,8 +1246,10 @@ cdef class MAE(RegressionCriterion): y_ik = y[i * self.y_stride + k] - impurity_right[0] += fabs(( y_ik) - median) - impurity_right[0] /= ((self.weighted_n_right) * self.n_outputs) + impurity_right[0] += fabs(( y_ik) - + median) + impurity_right[0] /= ((self.weighted_n_right) * + self.n_outputs) cdef class FriedmanMSE(MSE): diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd 
index 25c09783c73ba..883f454514008 100644 --- a/sklearn/tree/_utils.pxd +++ b/sklearn/tree/_utils.pxd @@ -148,10 +148,10 @@ cdef class WeightedMedianCalculator: cdef int push(self, DOUBLE_t data, DOUBLE_t weight) nogil cdef int update_median_parameters_post_push(self, DOUBLE_t data, DOUBLE_t weight, - double original_median) nogil + DOUBLE_t original_median) nogil cdef int remove(self, DOUBLE_t data, DOUBLE_t weight) nogil cdef int pop(self, DOUBLE_t* data, DOUBLE_t* weight) nogil cdef int update_median_parameters_post_remove(self, DOUBLE_t data, DOUBLE_t weight, - double original_median) nogil + DOUBLE_t original_median) nogil - cdef double get_median(self) nogil + cdef DOUBLE_t get_median(self) nogil diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index 0a98718ab9d20..84c562d198afa 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -511,7 +511,7 @@ cdef class WeightedMedianCalculator: in the median calculation. """ cdef int return_value - cdef double original_median + cdef DOUBLE_t original_median if self.size() != 0: original_median = self.get_median() @@ -522,7 +522,7 @@ cdef class WeightedMedianCalculator: cdef int update_median_parameters_post_push(self, DOUBLE_t data, DOUBLE_t weight, - double original_median) nogil: + DOUBLE_t original_median) nogil: """Update the parameters used in the median calculation, namely `k` and `sum_w_0_k` after an insertion""" @@ -567,7 +567,7 @@ cdef class WeightedMedianCalculator: from consideration in the median calculation """ cdef int return_value - cdef double original_median + cdef DOUBLE_t original_median if self.size() != 0: original_median = self.get_median() @@ -647,7 +647,7 @@ cdef class WeightedMedianCalculator: self.sum_w_0_k -= self.samples.get_weight_from_index(self.k) return 0 - cdef double get_median(self) nogil: + cdef DOUBLE_t get_median(self) nogil: """Return the median, taking into account sample weights.""" if self.sum_w_0_k == (self.total_weight / 2.0): From e25a52cff6964f83df946bdd5a577210851c141e Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Wed, 20 Jul 2016 16:43:34 -0700 Subject: [PATCH 70/75] add logic to allocate mediancalculators once, and reset otherwise --- sklearn/tree/_criterion.pyx | 37 +++++++++++++++++++++------------ sklearn/tree/_utils.pxd | 2 ++ sklearn/tree/_utils.pyx | 14 ++++++++++++ 3 files changed, 41 insertions(+), 12 deletions(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 23a2c206f24ef..3ad0acda77bdc 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -1018,6 +1018,15 @@ cdef class MAE(RegressionCriterion): SIZE_t end) nogil: """Initialize the criterion at node samples[start:end] and children samples[start:start] and samples[start:end].""" + cdef SIZE_t i + cdef SIZE_t p + cdef SIZE_t k + cdef DOUBLE_t y_ik + cdef DOUBLE_t w = 1.0 + cdef bint init_med_calculators + + if self.n_node_samples == 0: + init_med_calculators = 0 # Initialize fields self.y = y @@ -1030,20 +1039,25 @@ cdef class MAE(RegressionCriterion): self.weighted_n_samples = weighted_n_samples self.weighted_n_node_samples = 0.
- cdef SIZE_t i - cdef SIZE_t p - cdef SIZE_t k - cdef DOUBLE_t y_ik - cdef DOUBLE_t w = 1.0 + cdef void** left_child + cdef void** right_child - with gil: + # initialize WeightedMedianCalculators + if init_med_calculators == 0: + with gil: + for k in range(self.n_outputs): + self.left_child[k] = WeightedMedianCalculator(self.n_node_samples) + self.right_child[k] = WeightedMedianCalculator(self.n_node_samples) + # already initialized, so reset WeightedMedianCalculators + else: + left_child = self.left_child.data + right_child = self.right_child.data for k in range(self.n_outputs): - self.left_child[k] = WeightedMedianCalculator(self.n_node_samples) - self.right_child[k] = WeightedMedianCalculator(self.n_node_samples) - - cdef void** left_child = self.left_child.data - cdef void** right_child = self.right_child.data + ( left_child[k]).reset() + ( right_child[k]).reset() + left_child = self.left_child.data + right_child = self.right_child.data for p in range(start, end): i = samples[p] @@ -1058,7 +1072,6 @@ cdef class MAE(RegressionCriterion): ( right_child[k]).push(y_ik, w) self.weighted_n_node_samples += w - # calculate the node medians for k in range(self.n_outputs): self.node_medians[k] = ( right_child[k]).get_median() diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd index 883f454514008..d11880908c318 100644 --- a/sklearn/tree/_utils.pxd +++ b/sklearn/tree/_utils.pxd @@ -123,6 +123,7 @@ cdef class WeightedPQueue: cdef WeightedPQueueRecord* array_ cdef bint is_empty(self) nogil + cdef void reset(self) nogil cdef SIZE_t size(self) nogil cdef int push(self, DOUBLE_t data, DOUBLE_t weight) nogil cdef int remove(self, DOUBLE_t data, DOUBLE_t weight) nogil @@ -146,6 +147,7 @@ cdef class WeightedMedianCalculator: cdef SIZE_t size(self) nogil cdef int push(self, DOUBLE_t data, DOUBLE_t weight) nogil + cdef void reset(self) nogil cdef int update_median_parameters_post_push(self, DOUBLE_t data, DOUBLE_t weight, DOUBLE_t original_median) nogil diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index 84c562d198afa..9377cfa616e16 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -13,6 +13,7 @@ from libc.stdlib cimport free from libc.stdlib cimport malloc +from libc.stdlib cimport calloc from libc.stdlib cimport realloc from libc.math cimport log as ln @@ -337,6 +338,12 @@ cdef class WeightedPQueue: def __dealloc__(self): free(self.array_) + cdef void reset(self) nogil: + """Reset the WeightedPQueue to its state at construction""" + self.array_ptr = 0 + self.array_ = calloc(self.capacity, + sizeof(WeightedPQueueRecord)) + cdef bint is_empty(self) nogil: return self.array_ptr <= 0 @@ -505,6 +512,13 @@ cdef class WeightedMedianCalculator: WeightedMedianCalculator""" return self.samples.size() + cdef void reset(self) nogil: + """Reset the WeightedMedianCalculator to its state at construction""" + self.samples.reset() + self.total_weight = 0 + self.k = 0 + self.sum_w_0_k = 0 + cdef int push(self, DOUBLE_t data, DOUBLE_t weight) nogil: """Push a value and its associated weight to the WeightedMedianCalculator to be considered From bd0c71dc8441d9f88f943807558a51408626beca Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Wed, 20 Jul 2016 20:38:36 -0700 Subject: [PATCH 71/75] misc style fixes --- sklearn/tree/_criterion.pyx | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 3ad0acda77bdc..47736ac31221c 100644 --- a/sklearn/tree/_criterion.pyx +++ 
b/sklearn/tree/_criterion.pyx @@ -1018,9 +1018,8 @@ cdef class MAE(RegressionCriterion): SIZE_t end) nogil: """Initialize the criterion at node samples[start:end] and children samples[start:start] and samples[start:end].""" - cdef SIZE_t i - cdef SIZE_t p - cdef SIZE_t k + + cdef SIZE_t i, p, k cdef DOUBLE_t y_ik cdef DOUBLE_t w = 1.0 cdef bint init_med_calculators @@ -1082,8 +1081,7 @@ cdef class MAE(RegressionCriterion): cdef void reset(self) nogil: """Reset the criterion at pos=start.""" - cdef SIZE_t i - cdef SIZE_t k + cdef SIZE_t i, k cdef DOUBLE_t value cdef DOUBLE_t weight @@ -1094,8 +1092,8 @@ cdef class MAE(RegressionCriterion): self.weighted_n_right = self.weighted_n_node_samples self.pos = self.start - # reset the medianheaps, left should have no elements and - # right should have all elements. + # reset the WeightedMedianCalculators, left should have no + # elements and right should have all elements. for k in range(self.n_outputs): # if left has no elements, it's already reset @@ -1118,8 +1116,8 @@ cdef class MAE(RegressionCriterion): cdef void** left_child = self.left_child.data cdef void** right_child = self.right_child.data - # reverse_reset the medianheaps, right should have no elements and - # left should have all elements. + # reverse reset the WeightedMedianCalculators, right should have no + # elements and left should have all elements. for k in range(self.n_outputs): # if right has no elements, it's already reset for i in range(( right_child[k]).size()): @@ -1141,9 +1139,7 @@ cdef class MAE(RegressionCriterion): cdef DOUBLE_t* y = self.y cdef SIZE_t pos = self.pos cdef SIZE_t end = self.end - cdef SIZE_t i - cdef SIZE_t p - cdef SIZE_t k + cdef SIZE_t i, p, k cdef DOUBLE_t w = 1.0 cdef DOUBLE_t y_ik @@ -1223,6 +1219,7 @@ cdef class MAE(RegressionCriterion): left child (samples[start:pos]) and the impurity the right child (samples[pos:end]). """ + cdef DOUBLE_t* y = self.y cdef DOUBLE_t* sample_weight = self.sample_weight cdef SIZE_t* samples = self.samples From d3245ae9e842f6d0d48c1becab2f506877cc05e2 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Thu, 21 Jul 2016 08:35:50 -0700 Subject: [PATCH 72/75] modify cinit of regressioncriterion to take n_samples --- sklearn/tree/_criterion.pyx | 37 +++++++++++++++++-------------------- sklearn/tree/tree.py | 3 ++- 2 files changed, 19 insertions(+), 21 deletions(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 47736ac31221c..71b9085675eda 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -687,13 +687,16 @@ cdef class RegressionCriterion(Criterion): cdef double sq_sum_total - def __cinit__(self, SIZE_t n_outputs): + def __cinit__(self, SIZE_t n_outputs, SIZE_t n_samples): """Initialize parameters for this criterion. Parameters ---------- n_outputs: SIZE_t The number of targets to be predicted + + n_samples: SIZE_t + The total number of samples to fit on """ # Default values @@ -975,13 +978,16 @@ cdef class MAE(RegressionCriterion): cdef np.ndarray right_child cdef DOUBLE_t* node_medians - def __cinit__(self, SIZE_t n_outputs): + def __cinit__(self, SIZE_t n_outputs, SIZE_t n_samples): """Initialize parameters for this criterion. 
Parameters ---------- n_outputs: SIZE_t The number of targets to be predicted + + n_samples: SIZE_t + The total number of samples to fit on """ # Default values @@ -1012,6 +1018,10 @@ cdef class MAE(RegressionCriterion): self.left_child = np.empty(n_outputs, dtype='object') self.right_child = np.empty(n_outputs, dtype='object') + # initialize WeightedMedianCalculators + for k in range(n_outputs): + self.left_child[k] = WeightedMedianCalculator(n_samples) + self.right_child[k] = WeightedMedianCalculator(n_samples) cdef void init(self, DOUBLE_t* y, SIZE_t y_stride, DOUBLE_t* sample_weight, double weighted_n_samples, SIZE_t* samples, SIZE_t start, @@ -1022,10 +1032,6 @@ cdef class MAE(RegressionCriterion): cdef SIZE_t i, p, k cdef DOUBLE_t y_ik cdef DOUBLE_t w = 1.0 - cdef bint init_med_calculators - - if self.n_node_samples == 0: - init_med_calculators = 0 # Initialize fields self.y = y @@ -1041,22 +1047,13 @@ cdef class MAE(RegressionCriterion): cdef void** left_child cdef void** right_child - # initialize WeightedMedianCalculators - if init_med_calculators == 0: - with gil: - for k in range(self.n_outputs): - self.left_child[k] = WeightedMedianCalculator(self.n_node_samples) - self.right_child[k] = WeightedMedianCalculator(self.n_node_samples) - # already initialized, so reset WeightedMedianCalculators - else: - left_child = self.left_child.data - right_child = self.right_child.data - for k in range(self.n_outputs): - ( left_child[k]).reset() - ( right_child[k]).reset() - left_child = self.left_child.data right_child = self.right_child.data + + for k in range(self.n_outputs): + ( left_child[k]).reset() + ( right_child[k]).reset() + for p in range(start, end): i = samples[p] diff --git a/sklearn/tree/tree.py b/sklearn/tree/tree.py index deca4c7730754..f004d845279bc 100644 --- a/sklearn/tree/tree.py +++ b/sklearn/tree/tree.py @@ -338,7 +338,8 @@ def fit(self, X, y, sample_weight=None, check_input=True, criterion = CRITERIA_CLF[self.criterion](self.n_outputs_, self.n_classes_) else: - criterion = CRITERIA_REG[self.criterion](self.n_outputs_) + criterion = CRITERIA_REG[self.criterion](self.n_outputs_, + n_samples) SPLITTERS = SPARSE_SPLITTERS if issparse(X) else DENSE_SPLITTERS From dbaa57bf873ba0f680f67f802aeedd83247251f6 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Thu, 21 Jul 2016 09:25:22 -0700 Subject: [PATCH 73/75] add MAE formula and force rebuild bc. 
travis was down --- sklearn/tree/_criterion.pyx | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 71b9085675eda..7e2b6a3a80e9e 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -969,7 +969,10 @@ cdef class MSE(RegressionCriterion): impurity_right[0] /= self.n_outputs cdef class MAE(RegressionCriterion): - """Mean absolute error impurity criterion""" + """Mean absolute error impurity criterion + + MAE = (1 / n)*(\sum_i |y_i - f_i|), where y_i is the true + value and f_i is the predicted value.""" def __dealloc__(self): """Destructor.""" free(self.node_medians) From f668ab9aca168759a761195db5e7ffe03ad34609 Mon Sep 17 00:00:00 2001 From: Nelson Liu Date: Thu, 21 Jul 2016 13:12:47 -0700 Subject: [PATCH 74/75] add criterion parameter to gradient boosting and add forest tests --- sklearn/ensemble/gradient_boosting.py | 31 +++++++++++++++++++++------ sklearn/ensemble/tests/test_forest.py | 4 ++-- 2 files changed, 26 insertions(+), 9 deletions(-) diff --git a/sklearn/ensemble/gradient_boosting.py b/sklearn/ensemble/gradient_boosting.py index b17d726cb122a..1b0767d419168 100644 --- a/sklearn/ensemble/gradient_boosting.py +++ b/sklearn/ensemble/gradient_boosting.py @@ -720,8 +720,8 @@ class BaseGradientBoosting(six.with_metaclass(ABCMeta, BaseEnsemble, """Abstract base class for Gradient Boosting. """ @abstractmethod - def __init__(self, loss, learning_rate, n_estimators, min_samples_split, - min_samples_leaf, min_weight_fraction_leaf, + def __init__(self, loss, learning_rate, n_estimators, criterion, + min_samples_split, min_samples_leaf, min_weight_fraction_leaf, max_depth, init, subsample, max_features, random_state, alpha=0.9, verbose=0, max_leaf_nodes=None, warm_start=False, presort='auto'): @@ -729,6 +729,7 @@ def __init__(self, loss, learning_rate, n_estimators, min_samples_split, self.n_estimators = n_estimators self.learning_rate = learning_rate self.loss = loss + self.criterion = criterion self.min_samples_split = min_samples_split self.min_samples_leaf = min_samples_leaf self.min_weight_fraction_leaf = min_weight_fraction_leaf @@ -762,7 +763,7 @@ def _fit_stage(self, i, X, y, y_pred, sample_weight, sample_mask, # induce regression tree on residuals tree = DecisionTreeRegressor( - criterion='friedman_mse', + criterion=self.criterion, splitter='best', max_depth=self.max_depth, min_samples_split=self.min_samples_split, @@ -1296,6 +1297,14 @@ class GradientBoostingClassifier(BaseGradientBoosting, ClassifierMixin): of the input variables. Ignored if ``max_leaf_nodes`` is not None. + criterion : string, optional (default="friedman_mse") + The function to measure the quality of a split. Supported criteria + are "friedman_mse" for the mean squared error with improvement + score by Friedman, "mse" for mean squared error, and "mae" for + the mean absolute error. The default value of "friedman_mse" is + generally the best as it can provide a better approximation in + some cases. 
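Once this patch is applied, the new argument reads like any other estimator parameter. A quick sketch with arbitrary toy data (the dataset and hyperparameter values here are illustrative only):

    from sklearn.ensemble import GradientBoostingRegressor

    X = [[0.0], [1.0], [2.0], [3.0], [4.0], [5.0]]
    y = [0.0, 0.9, 2.1, 2.9, 4.2, 5.1]

    # grow each regression tree in the ensemble with MAE splits instead
    # of the default "friedman_mse"
    est = GradientBoostingRegressor(criterion='mae', n_estimators=10,
                                    random_state=0)
    est.fit(X, y)
    print(est.predict([[2.5]]))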
+ min_samples_split : int, float, optional (default=2) The minimum number of samples required to split an internal node: @@ -1426,7 +1435,7 @@ class GradientBoostingClassifier(BaseGradientBoosting, ClassifierMixin): _SUPPORTED_LOSS = ('deviance', 'exponential') def __init__(self, loss='deviance', learning_rate=0.1, n_estimators=100, - subsample=1.0, min_samples_split=2, + subsample=1.0, criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0., max_depth=3, init=None, random_state=None, max_features=None, verbose=0, @@ -1435,7 +1444,7 @@ def __init__(self, loss='deviance', learning_rate=0.1, n_estimators=100, super(GradientBoostingClassifier, self).__init__( loss=loss, learning_rate=learning_rate, n_estimators=n_estimators, - min_samples_split=min_samples_split, + criterion=criterion, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf, min_weight_fraction_leaf=min_weight_fraction_leaf, max_depth=max_depth, init=init, subsample=subsample, @@ -1643,6 +1652,14 @@ class GradientBoostingRegressor(BaseGradientBoosting, RegressorMixin): of the input variables. Ignored if ``max_leaf_nodes`` is not None. + criterion : string, optional (default="friedman_mse") + The function to measure the quality of a split. Supported criteria + are "friedman_mse" for the mean squared error with improvement + score by Friedman, "mse" for mean squared error, and "mae" for + the mean absolute error. The default value of "friedman_mse" is + generally the best as it can provide a better approximation in + some cases. + min_samples_split : int, float, optional (default=2) The minimum number of samples required to split an internal node: @@ -1772,7 +1789,7 @@ class GradientBoostingRegressor(BaseGradientBoosting, RegressorMixin): _SUPPORTED_LOSS = ('ls', 'lad', 'huber', 'quantile') def __init__(self, loss='ls', learning_rate=0.1, n_estimators=100, - subsample=1.0, min_samples_split=2, + subsample=1.0, criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0., max_depth=3, init=None, random_state=None, max_features=None, alpha=0.9, verbose=0, max_leaf_nodes=None, @@ -1780,7 +1797,7 @@ def __init__(self, loss='ls', learning_rate=0.1, n_estimators=100, super(GradientBoostingRegressor, self).__init__( loss=loss, learning_rate=learning_rate, n_estimators=n_estimators, - min_samples_split=min_samples_split, + criterion=criterion, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf, min_weight_fraction_leaf=min_weight_fraction_leaf, max_depth=max_depth, init=init, subsample=subsample, diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py index ce3642c5cfe21..489ba40689d38 100644 --- a/sklearn/ensemble/tests/test_forest.py +++ b/sklearn/ensemble/tests/test_forest.py @@ -159,7 +159,7 @@ def check_boston_criterion(name, criterion): def test_boston(): - for name, criterion in product(FOREST_REGRESSORS, ("mse", )): + for name, criterion in product(FOREST_REGRESSORS, ("mse", "mae", "friedman_mse")): yield check_boston_criterion, name, criterion @@ -244,7 +244,7 @@ def test_importances(): for name, criterion in product(FOREST_CLASSIFIERS, ["gini", "entropy"]): yield check_importances, name, criterion, X, y - for name, criterion in product(FOREST_REGRESSORS, ["mse", "friedman_mse"]): + for name, criterion in product(FOREST_REGRESSORS, ["mse", "friedman_mse", "mae"]): yield check_importances, name, criterion, X, y From 04d3b8add4f407101a2add2621d55f09dc6c5e14 Mon Sep 17 00:00:00 2001 From: 
Nelson Liu Date: Thu, 21 Jul 2016 13:51:30 -0700 Subject: [PATCH 75/75] add entries to what's new --- doc/whats_new.rst | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/doc/whats_new.rst b/doc/whats_new.rst index 3a3ddf932a828..dd339bcabb6da 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -117,6 +117,14 @@ New features and Harabaz score to evaluate the resulting clustering of a set of points. By `Arnaud Fouchet`_ and `Thierry Guillemot`_. + - Added a new splitting criterion for :class:`tree.DecisionTreeRegressor`, + the mean absolute error. This criterion can also be used in + :class:`ensemble.ExtraTreesRegressor`, + :class:`ensemble.RandomForestRegressor`, and the gradient boosting + estimators. (`#6667 + `_) by `Nelson + Liu`_. + Enhancements ............ @@ -146,6 +154,11 @@ Enhancements provided as a percentage of the training samples. By `yelite`_ and `Arnaud Joly`_. + - Gradient boosting estimators accept the parameter ``criterion`` to specify + the splitting criterion used when building decision trees. (`#6667 + `_) by `Nelson + Liu`_. + - Codebase does not contain C/C++ cython generated files: they are generated during build. Distribution packages will still contain generated C/C++ files. By `Arthur Mensch`_. @@ -4280,3 +4293,5 @@ David Huard, Dave Morrill, Ed Schofield, Travis Oliphant, Pearu Peterson. .. _Sebastian Säger: https://github.com/ssaeger .. _YenChen Lin: https://github.com/yenchenlin + +.. _Nelson Liu: https://github.com/nelson-liu
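The unweighted toy case asserted in test_mae above can be reproduced by hand: the root node's value is the median of y, and its impurity is the mean absolute deviation from that median. A minimal sketch of the arithmetic:

    y = [6, 7, 3, 4, 3]
    ordered = sorted(y)                  # [3, 3, 4, 6, 7]
    median = ordered[len(ordered) // 2]  # 4 -- odd count, so the whole median
    mae = sum(abs(v - median) for v in y) / float(len(y))
    print(median, mae)                   # 4 1.4

This matches the asserted tree_.value.flat[0] == 4 and tree_.impurity[0] == 1.4 for the root node.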