diff --git a/sklearn/__init__.py b/sklearn/__init__.py
index 4e80774bc3110..d1db93e9ab1b5 100644
--- a/sklearn/__init__.py
+++ b/sklearn/__init__.py
@@ -37,7 +37,7 @@
 # Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer.
 # 'X.Y.dev0' is the canonical version of 'X.Y.dev'
 #
-__version__ = '0.19.dev0'
+__version__ = '0.19.dev1'
 
 try:
diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx
index 26c40dc8d6616..0d30ef9b9f945 100644
--- a/sklearn/tree/_criterion.pyx
+++ b/sklearn/tree/_criterion.pyx
@@ -969,6 +969,7 @@ cdef class MSE(RegressionCriterion):
         impurity_left[0] /= self.n_outputs
         impurity_right[0] /= self.n_outputs
 
+
 cdef class MAE(RegressionCriterion):
     """Mean absolute error impurity criterion
diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd
index 4b97d01614b9f..36bc7f7c8d4b1 100644
--- a/sklearn/tree/_splitter.pxd
+++ b/sklearn/tree/_splitter.pxd
@@ -19,6 +19,7 @@ ctypedef np.npy_float64 DOUBLE_t       # Type of y, sample_weight
 ctypedef np.npy_intp SIZE_t            # Type for indices and counters
 ctypedef np.npy_int32 INT32_t          # Signed 32 bit integer
 ctypedef np.npy_uint32 UINT32_t        # Unsigned 32 bit integer
+ctypedef np.npy_uint64 UINT64_t        # Unsigned 64 bit integer
 
 cdef struct SplitRecord:
     # Data to track sample split
@@ -30,6 +31,9 @@ cdef struct SplitRecord:
     double improvement     # Impurity improvement given parent node.
     double impurity_left   # Impurity of the left split.
     double impurity_right  # Impurity of the right split.
+    SIZE_t n_categories    # Num. of categories of the feature; -1 if not categorical.
+    UINT64_t split_map     # Bitmap guiding how to split; 1 means right node.
+
 
 cdef class Splitter:
     # The splitter searches in the input space for a feature and a threshold
@@ -83,7 +87,8 @@ cdef class Splitter:
     # Methods
     cdef void init(self, object X, np.ndarray y,
                    DOUBLE_t* sample_weight,
-                   np.ndarray X_idx_sorted=*) except *
+                   np.ndarray X_idx_sorted=*,
+                   np.ndarray categorical_features=*) except *
 
     cdef void node_reset(self, SIZE_t start, SIZE_t end,
                          double* weighted_n_node_samples) nogil
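The two new `SplitRecord` fields encode a categorical split as a 64-bit bitmap: bit `c` set to 1 sends samples with category label `c` to the right child. A minimal pure-Python sketch of that encoding (helper names here are illustrative, not part of the patch):

```python
def make_split_map(right_categories):
    """Pack category labels (0..63) into a uint64-style bitmap.

    Bit c == 1 means "category c goes to the right child", mirroring
    the semantics of SplitRecord.split_map in this patch.
    """
    split_map = 0
    for c in right_categories:
        if not 0 <= c < 64:
            raise ValueError("label %d does not fit in a 64-bit map" % c)
        split_map |= 1 << c
    return split_map


def category_goes_right(split_map, category):
    """Read one category's direction back out of the bitmap."""
    return bool((split_map >> category) & 1)


# Example: categories {1, 3} go right, everything else goes left.
m = make_split_map({1, 3})
assert m == 0b1010
assert category_goes_right(m, 3) and not category_goes_right(m, 0)
```

This 64-bit representation is also what motivates the hard cardinality cap (`MAX_CATEGORICAL_LABEL = 64`) enforced later in `tree.py`.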
diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx
index 5fa7ee553fe2d..3d3fcb36860fe 100644
--- a/sklearn/tree/_splitter.pyx
+++ b/sklearn/tree/_splitter.pyx
@@ -28,6 +28,7 @@ np.import_array()
 
 from scipy.sparse import csc_matrix
 
+from ._utils cimport goes_right
 from ._utils cimport log
 from ._utils cimport rand_int
 from ._utils cimport rand_uniform
@@ -43,13 +44,21 @@ cdef DTYPE_t FEATURE_THRESHOLD = 1e-7
 # in SparseSplitter
 cdef DTYPE_t EXTRACT_NNZ_SWITCH = 0.1
 
-cdef inline void _init_split(SplitRecord* self, SIZE_t start_pos) nogil:
+cdef SIZE_t MAX_CATEGORICAL_LABEL = 64
+
+cdef DOUBLE_t Y_MEAN_UNDEFINED = -1
+cdef SIZE_t CARDINALITY_UNDEFINED = -2
+
+cdef inline void _init_split(SplitRecord* self, SIZE_t start_pos) nogil:
     self.impurity_left = INFINITY
     self.impurity_right = INFINITY
    self.pos = start_pos
     self.feature = 0
     self.threshold = 0.
     self.improvement = -INFINITY
+    self.n_categories = CARDINALITY_UNDEFINED
+    self.split_map = 0
+
 
 cdef class Splitter:
     """Abstract splitter class.
@@ -120,7 +129,8 @@ cdef class Splitter:
                    object X,
                    np.ndarray[DOUBLE_t, ndim=2, mode="c"] y,
                    DOUBLE_t* sample_weight,
-                   np.ndarray X_idx_sorted=None) except *:
+                   np.ndarray X_idx_sorted=None,
+                   np.ndarray categorical_features=None) except *:
        """Initialize the splitter.

        Take in the input data X, the target Y, and optional sample weights.
@@ -137,8 +147,14 @@ cdef class Splitter:
             The weights of the samples, where higher weighted samples are fit
             closer than lower weight samples. If not provided, all samples
             are assumed to have uniform weight.
-        """
+        X_idx_sorted : np.ndarray
+            Only the ``BaseDenseSplitter`` subclass handles this parameter.
+
+        categorical_features : np.ndarray
+            Only the ``BaseDenseSplitter`` subclass handles this parameter.
 
+        """
         self.rand_r_state = self.random_state.randint(0, RAND_R_MAX)
 
         cdef SIZE_t n_samples = X.shape[0]
@@ -179,8 +195,6 @@ cdef class Splitter:
         self.y = <DOUBLE_t*> y.data
         self.y_stride = <SIZE_t> y.strides[0] / <SIZE_t> y.itemsize
 
-        self.sample_weight = sample_weight
-
     cdef void node_reset(self, SIZE_t start, SIZE_t end,
                          double* weighted_n_node_samples) nogil:
         """Reset splitter on node samples[start:end].
@@ -240,6 +254,13 @@ cdef class BaseDenseSplitter(Splitter):
     cdef SIZE_t n_total_samples
     cdef SIZE_t* sample_mask
 
+    cdef np.ndarray categorical_features
+    cdef SIZE_t* categorical_flags
+
+    cdef DOUBLE_t* feature_y_sum
+    cdef DOUBLE_t* feature_y_count
+    cdef DOUBLE_t* feature_y_mean
+
     def __cinit__(self, Criterion criterion, SIZE_t max_features,
                   SIZE_t min_samples_leaf, double min_weight_leaf,
                   object random_state, bint presort):
@@ -252,16 +273,29 @@ cdef class BaseDenseSplitter(Splitter):
         self.sample_mask = NULL
         self.presort = presort
 
+        self.categorical_flags = NULL
+
+        self.feature_y_sum = NULL
+        self.feature_y_count = NULL
+        self.feature_y_mean = NULL
+
     def __dealloc__(self):
         """Destructor."""
         if self.presort == 1:
             free(self.sample_mask)
 
+        free(self.categorical_flags)
+
+        free(self.feature_y_sum)
+        free(self.feature_y_count)
+        free(self.feature_y_mean)
+
     cdef void init(self,
                    object X,
                    np.ndarray[DOUBLE_t, ndim=2, mode="c"] y,
                    DOUBLE_t* sample_weight,
-                   np.ndarray X_idx_sorted=None) except *:
+                   np.ndarray X_idx_sorted=None,
+                   np.ndarray categorical_features=None) except *:
         """Initialize the splitter."""
 
         # Call parent init
@@ -282,7 +316,26 @@ cdef class BaseDenseSplitter(Splitter):
             self.n_total_samples = X.shape[0]
             safe_realloc(&self.sample_mask, self.n_total_samples)
-            memset(self.sample_mask, 0, self.n_total_samples*sizeof(SIZE_t))
+            memset(self.sample_mask, 0, self.n_total_samples * sizeof(SIZE_t))
+
+        safe_realloc(&self.categorical_flags, self.n_features)
+        memset(self.categorical_flags, 0, self.n_features * sizeof(SIZE_t))
+
+        cdef SIZE_t i
+        for i in range(self.n_features):
+            self.categorical_flags[i] = 0
+
+        if categorical_features is not None:
+            for i in range(categorical_features.size):
+                self.categorical_flags[<SIZE_t> categorical_features[i]] = 1
+
+        safe_realloc(&self.feature_y_sum, MAX_CATEGORICAL_LABEL)
+        safe_realloc(&self.feature_y_count, MAX_CATEGORICAL_LABEL)
+        safe_realloc(&self.feature_y_mean, MAX_CATEGORICAL_LABEL)
+
+        memset(self.feature_y_sum, 0, MAX_CATEGORICAL_LABEL * sizeof(DOUBLE_t))
+        memset(self.feature_y_count, 0, MAX_CATEGORICAL_LABEL * sizeof(DOUBLE_t))
+        memset(self.feature_y_mean, 0, MAX_CATEGORICAL_LABEL * sizeof(DOUBLE_t))
 
 
 cdef class BestSplitter(BaseDenseSplitter):
@@ -404,7 +457,7 @@ cdef class BestSplitter(BaseDenseSplitter):
                     p = start
                     feature_idx_offset = self.X_idx_sorted_stride * current.feature
 
-                    for i in range(self.n_total_samples): 
+                    for i in range(self.n_total_samples):
                         j = X_idx_sorted[i + feature_idx_offset]
                         if sample_mask[j] == 1:
                             samples[p] = j
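The `feature_y_sum`, `feature_y_count` and `feature_y_mean` buffers allocated above are fixed-size scratch arrays, one slot per possible category label, used to compute the mean target per category at each node. A NumPy sketch of the same computation (hypothetical helper, single-output regression assumed):

```python
import numpy as np

def per_category_mean(x_cat, y, n_categories=64):
    """Mean of y per integer category label, NaN where a label is absent.

    Mirrors the feature_y_sum / feature_y_count / feature_y_mean buffers
    that BaseDenseSplitter.init allocates in this patch.
    """
    sums = np.zeros(n_categories)
    counts = np.zeros(n_categories)
    np.add.at(sums, x_cat, y)
    np.add.at(counts, x_cat, 1)
    return np.where(counts > 0, sums / np.maximum(counts, 1), np.nan)

x_cat = np.array([0, 0, 2, 2, 2, 5])
y = np.array([1.0, 3.0, 0.0, 0.0, 3.0, 10.0])
means = per_category_mean(x_cat, y)
assert means[0] == 2.0 and means[2] == 1.0 and means[5] == 10.0
```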
@@ -511,6 +564,308 @@ cdef class BestSplitter(BaseDenseSplitter):
         n_constant_features[0] = n_total_constants
 
 
+cdef class SmartSplitter(BaseDenseSplitter):
+    """Splitter that can handle categorical features in a smart way."""
+
+    def __reduce__(self):
+        return (SmartSplitter, (self.criterion,
+                                self.max_features,
+                                self.min_samples_leaf,
+                                self.min_weight_leaf,
+                                self.random_state,
+                                self.presort), self.__getstate__())
+
+    cdef void node_split(self, double impurity, SplitRecord* split,
+                         SIZE_t* n_constant_features) nogil:
+        """Find the best split on node samples[start:end]."""
+        # Find the best split
+        cdef SIZE_t* samples = self.samples
+        cdef SIZE_t start = self.start
+        cdef SIZE_t end = self.end
+
+        cdef SIZE_t* features = self.features
+        cdef SIZE_t* constant_features = self.constant_features
+        cdef SIZE_t n_features = self.n_features
+
+        cdef DTYPE_t* X = self.X
+        cdef DTYPE_t* Xf = self.feature_values
+        cdef SIZE_t X_sample_stride = self.X_sample_stride
+        cdef SIZE_t X_feature_stride = self.X_feature_stride
+        cdef SIZE_t max_features = self.max_features
+        cdef SIZE_t min_samples_leaf = self.min_samples_leaf
+        cdef double min_weight_leaf = self.min_weight_leaf
+        cdef UINT32_t* random_state = &self.rand_r_state
+
+        cdef INT32_t* X_idx_sorted = self.X_idx_sorted_ptr
+        cdef SIZE_t* sample_mask = self.sample_mask
+
+        cdef SplitRecord best, current
+        cdef double current_proxy_improvement = -INFINITY
+        cdef double best_proxy_improvement = -INFINITY
+
+        cdef SIZE_t f_i = n_features
+        cdef SIZE_t f_j
+        cdef SIZE_t tmp
+        cdef SIZE_t p
+        cdef SIZE_t feature_idx_offset
+        cdef SIZE_t feature_offset
+        cdef SIZE_t i
+        cdef SIZE_t j
+
+        cdef SIZE_t n_visited_features = 0
+        # Number of features discovered to be constant during the split search
+        cdef SIZE_t n_found_constants = 0
+        # Number of features known to be constant and drawn without replacement
+        cdef SIZE_t n_drawn_constants = 0
+        cdef SIZE_t n_known_constants = n_constant_features[0]
+        # n_total_constants = n_known_constants + n_found_constants
+        cdef SIZE_t n_total_constants = n_known_constants
+        cdef DTYPE_t current_feature_value
+        cdef SIZE_t partition_end
+
+        # For categorical features
+        cdef UINT64_t split_map = 0
+        cdef SIZE_t* categorical_flags = self.categorical_flags
+        cdef SIZE_t is_categorical = 0
+        cdef DTYPE_t sample_value
+        cdef SIZE_t sample_label
+        cdef SIZE_t sample_max_label = 0
+        cdef DOUBLE_t* feature_y_sum = self.feature_y_sum
+        cdef DOUBLE_t* feature_y_count = self.feature_y_count
+        cdef DOUBLE_t* feature_y_mean = self.feature_y_mean
+
+        _init_split(&best, end)
+
+        if self.presort == 1:
+            for p in range(start, end):
+                sample_mask[samples[p]] = 1
+
+        # Sample up to max_features without replacement using a
+        # Fisher-Yates-based algorithm (using the local variables `f_i` and
+        # `f_j` to compute a permutation of the `features` array).
+        #
+        # Skip the CPU intensive evaluation of the impurity criterion for
+        # features that were already detected as constant (hence not suitable
+        # for good splitting) by ancestor nodes and save the information on
+        # newly discovered constant features to spare computation on descendant
+        # nodes.
+        while (f_i > n_total_constants and  # Stop early if remaining features
+                                            # are constant
+                (n_visited_features < max_features or
+                 # At least one drawn feature must be non constant
+                 n_visited_features <= n_found_constants + n_drawn_constants)):
+
+            n_visited_features += 1
+
+            # Loop invariant: elements of features in
+            # - [:n_drawn_constant[ holds drawn and known constant features;
+            # - [n_drawn_constant:n_known_constant[ holds known constant
+            #   features that haven't been drawn yet;
+            # - [n_known_constant:n_total_constant[ holds newly found constant
+            #   features;
+            # - [n_total_constant:f_i[ holds features that haven't been drawn
+            #   yet and aren't constant a priori.
+            # - [f_i:n_features[ holds features that have been drawn
+            #   and aren't constant.
+
+            # Draw a feature at random
+            f_j = rand_int(n_drawn_constants, f_i - n_found_constants,
+                           random_state)
+
+            sample_max_label = 0
+
+            if f_j < n_known_constants:
+                # f_j in the interval [n_drawn_constants, n_known_constants[
+                tmp = features[f_j]
+                features[f_j] = features[n_drawn_constants]
+                features[n_drawn_constants] = tmp
+
+                n_drawn_constants += 1
+
+            else:
+                # f_j in the interval [n_known_constants, f_i - n_found_constants[
+                f_j += n_found_constants
+                # f_j in the interval [n_total_constants, f_i[
+                current.feature = features[f_j]
+                current.n_categories = CARDINALITY_UNDEFINED
+                feature_offset = self.X_feature_stride * current.feature
+                is_categorical = categorical_flags[current.feature]
+
+                ################################################################
+                if is_categorical:
+                    # Here we have a categorical feature; treat it very differently.
+                    for i in range(start, end):
+                        Xf[i] = X[self.X_sample_stride * samples[i] + feature_offset]
+                        if Xf[i] > sample_max_label:
+                            sample_max_label = Xf[i]
+
+                    current.n_categories = sample_max_label + 1
+
+                    # Compute the mean of y for each categorical label.
+                    # TODO: Figure out a way to handle corner cases when we cannot
+                    # observe all the categorical values in the sub-nodes.
+                    for i in range(sample_max_label + 1):
+                        # Initialize, `i` is the categorical label.
+                        feature_y_sum[i] = 0.0
+                        feature_y_count[i] = 0.0
+                        feature_y_mean[i] = Y_MEAN_UNDEFINED
+
+                    for p in range(start, end):
+                        feature_y_sum[<SIZE_t> Xf[p]] += self.y[samples[p] * self.y_stride]
+                        feature_y_count[<SIZE_t> Xf[p]] += 1
+
+                    for i in range(sample_max_label + 1):
+                        if feature_y_count[i] > 0:
+                            feature_y_mean[i] = feature_y_sum[i] / feature_y_count[i]
+
+                    # First sort Xf and samples by the categorical labels
+                    sort(Xf + start, samples + start, end - start)
+
+                    # Update Xf with the transformed value
+                    for p in range(start, end):
+                        Xf[p] = feature_y_mean[<SIZE_t> Xf[p]]
+
+                    # Then sort Xf and samples by the mean y associated with each label.
+                    sort(Xf + start, samples + start, end - start)
+
+                    # TODO: Add a new array variable to avoid assigning the values again.
+                    for i in range(start, end):
+                        Xf[i] = X[self.X_sample_stride * samples[i] + feature_offset]
+
+                ################################################################
+                else:
+                    # Here we have a continuous feature.
+                    # Sort samples along that feature; either by utilizing
+                    # presorting, or by copying the values into an array and
+                    # sorting the array in a manner which utilizes the cache more
+                    # effectively.
+                    if self.presort == 1:
+                        p = start
+                        feature_idx_offset = self.X_idx_sorted_stride * current.feature
+
+                        for i in range(self.n_total_samples):
+                            j = X_idx_sorted[i + feature_idx_offset]
+                            if sample_mask[j] == 1:
+                                samples[p] = j
+                                Xf[p] = X[self.X_sample_stride * j + feature_offset]
+                                p += 1
+                    else:
+                        for i in range(start, end):
+                            Xf[i] = X[self.X_sample_stride * samples[i] + feature_offset]
+
+                        sort(Xf + start, samples + start, end - start)
+
+                ################################################################
+
+                if ((not is_categorical and Xf[end - 1] <= Xf[start] + FEATURE_THRESHOLD) or
+                        (is_categorical and Xf[end - 1] == Xf[start])):
+                    # This is a constant feature.
+                    features[f_j] = features[n_total_constants]
+                    features[n_total_constants] = current.feature
+
+                    n_found_constants += 1
+                    n_total_constants += 1
+
+                else:
+                    # This is not a constant feature.
+                    f_i -= 1
+                    features[f_i], features[f_j] = features[f_j], features[f_i]
+
+                    # Evaluate all splits
+                    self.criterion.reset()
+                    p = start
+
+                    while p < end:
+                        while (p + 1 < end and (
+                                (not is_categorical and Xf[p + 1] <= Xf[p] + FEATURE_THRESHOLD) or
+                                (is_categorical and Xf[p + 1] == Xf[p]))):
+                            p += 1
+
+                        # (p + 1 >= end) or (X[samples[p + 1], current.feature] !=
+                        #                    X[samples[p], current.feature])
+                        p += 1
+                        # (p >= end) or (X[samples[p], current.feature] !=
+                        #                X[samples[p - 1], current.feature])
+
+                        if p < end:
+                            current.pos = p
+
+                            # Reject if min_samples_leaf is not guaranteed
+                            if (((current.pos - start) < min_samples_leaf) or
+                                    ((end - current.pos) < min_samples_leaf)):
+                                continue
+
+                            self.criterion.update(current.pos)
+
+                            # Reject if min_weight_leaf is not satisfied
+                            if ((self.criterion.weighted_n_left < min_weight_leaf) or
+                                    (self.criterion.weighted_n_right < min_weight_leaf)):
+                                continue
+
+                            current_proxy_improvement = self.criterion.proxy_impurity_improvement()
+
+                            if current_proxy_improvement > best_proxy_improvement:
+                                best_proxy_improvement = current_proxy_improvement
+                                current.threshold = (Xf[p - 1] + Xf[p]) / 2.0
+
+                                if current.threshold == Xf[p]:
+                                    current.threshold = Xf[p - 1]
+
+                                if is_categorical:
+                                    split_map = 0
+                                    for i in range(current.pos, end):
+                                        # 1 in the bitmap refers to the right node.
+                                        split_map |= (<UINT64_t> 1) << (<SIZE_t> Xf[i])
+                                else:
+                                    split_map = 0
+
+                                best = current  # copy
+                                best.split_map = split_map
+
+        # Reorganize into samples[start:best.pos] + samples[best.pos:end]
+        if best.pos < end:
+            feature_offset = X_feature_stride * best.feature
+            partition_end = end
+            p = start
+
+            while p < partition_end:
+                sample_value = X[X_sample_stride * samples[p] + feature_offset]
+
+                if goes_right(sample_value, best.threshold, best.n_categories,
+                              best.split_map):
+                    partition_end -= 1
+
+                    tmp = samples[partition_end]
+                    samples[partition_end] = samples[p]
+                    samples[p] = tmp
+                else:
+                    p += 1
+
+            self.criterion.reset()
+            self.criterion.update(best.pos)
+            best.improvement = self.criterion.impurity_improvement(impurity)
+            self.criterion.children_impurity(&best.impurity_left,
+                                             &best.impurity_right)
+
+        # Reset sample mask
+        if self.presort == 1:
+            for p in range(start, end):
+                sample_mask[samples[p]] = 0
+
+        # Respect invariant for constant features: the original order of
+        # elements in features[:n_known_constants] must be preserved for sibling
+        # and child nodes
+        memcpy(features, constant_features, sizeof(SIZE_t) * n_known_constants)
+
+        # Copy newly found constant features
+        memcpy(constant_features + n_known_constants,
+               features + n_known_constants,
+               sizeof(SIZE_t) * n_found_constants)
+
+        # Return values
+        split[0] = best
+        n_constant_features[0] = n_total_constants
+
+
 # Sort n-element arrays pointed to by Xf and samples, simultaneously,
 # by the values in Xf. Algorithm: Introsort (Musser, SP&E, 1997).
 cdef inline void sort(DTYPE_t* Xf, SIZE_t* samples, SIZE_t n) nogil:
@@ -869,7 +1224,8 @@ cdef class BaseSparseSplitter(Splitter):
                    object X,
                    np.ndarray[DOUBLE_t, ndim=2, mode="c"] y,
                    DOUBLE_t* sample_weight,
-                   np.ndarray X_idx_sorted=None) except *:
+                   np.ndarray X_idx_sorted=None,
+                   np.ndarray categorical_features=None) except *:
         """Initialize the splitter."""
 
         # Call parent init
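The categorical branch of `SmartSplitter.node_split` implements the classic reduction (Fisher's grouping result, used by Breiman et al. for regression and binary classification): order the categories by their mean response, then only the K-1 contiguous cut points in that order need to be scanned, instead of all 2^(K-1) subsets. A self-contained sketch of the search it performs, in plain Python with a hypothetical `best_categorical_split` helper:

```python
import numpy as np

def best_categorical_split(x_cat, y):
    """Best binary partition of categories by scanning mean-ordered cuts.

    Returns (right_bitmap, sse), where the bitmap follows the patch's
    convention: bit c == 1 sends category c to the right child. This
    sketches the strategy SmartSplitter uses, not the Cython code itself.
    """
    cats = np.unique(x_cat)
    order = sorted(cats, key=lambda c: y[x_cat == c].mean())
    best = (0, np.inf)
    for cut in range(1, len(order)):
        right = set(order[cut:])
        mask = np.isin(x_cat, list(right))
        sse = (((y[~mask] - y[~mask].mean()) ** 2).sum() +
               ((y[mask] - y[mask].mean()) ** 2).sum())
        if sse < best[1]:
            bitmap = 0
            for c in right:
                bitmap |= 1 << int(c)
            best = (bitmap, sse)
    return best

x = np.array([0, 0, 1, 1, 2, 2])
y = np.array([5.0, 6.0, 0.0, 1.0, 5.5, 6.5])
bitmap, _ = best_categorical_split(x, y)
assert bitmap == (1 << 0) | (1 << 2)  # categories 0 and 2 share high means
```

The Cython code gets the same effect by rewriting `Xf` to per-category means, reusing the ordinary sorted-threshold scan, and then materializing the winning cut as a bitmap.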
diff --git a/sklearn/tree/_tree.pxd b/sklearn/tree/_tree.pxd
index dbf0545b1e1d5..ac6e7bbce2050 100644
--- a/sklearn/tree/_tree.pxd
+++ b/sklearn/tree/_tree.pxd
@@ -18,13 +18,13 @@ ctypedef np.npy_float64 DOUBLE_t       # Type of y, sample_weight
 ctypedef np.npy_intp SIZE_t            # Type for indices and counters
 ctypedef np.npy_int32 INT32_t          # Signed 32 bit integer
 ctypedef np.npy_uint32 UINT32_t        # Unsigned 32 bit integer
+ctypedef np.npy_uint64 UINT64_t        # Unsigned 64 bit integer
 
 from ._splitter cimport Splitter
 from ._splitter cimport SplitRecord
 
 cdef struct Node:
     # Base storage structure for the nodes in a Tree object
-
     SIZE_t left_child                    # id of the left child of the node
     SIZE_t right_child                   # id of the right child of the node
     SIZE_t feature                       # Feature used for splitting the node
@@ -32,6 +32,8 @@ cdef struct Node:
     DOUBLE_t impurity                    # Impurity of the node (i.e., the value of the criterion)
     SIZE_t n_node_samples                # Number of samples at the node
     DOUBLE_t weighted_n_node_samples     # Weighted number of samples at the node
+    SIZE_t n_categories                  # Num. of categories of the feature; -1 if not categorical.
+    UINT64_t split_map                   # Bitmap guiding how to split; 1 means right node.
 
 
 cdef class Tree:
@@ -58,7 +60,9 @@ cdef class Tree:
     cdef SIZE_t _add_node(self, SIZE_t parent, bint is_left, bint is_leaf,
                           SIZE_t feature, double threshold, double impurity,
                           SIZE_t n_node_samples,
-                          double weighted_n_samples) nogil
+                          double weighted_n_samples,
+                          SIZE_t n_categories,
+                          UINT64_t split_map) nogil
     cdef void _resize(self, SIZE_t capacity) except *
     cdef int _resize_c(self, SIZE_t capacity=*) nogil
@@ -100,5 +104,6 @@ cdef class TreeBuilder:
     cpdef build(self, Tree tree, object X, np.ndarray y,
                 np.ndarray sample_weight=*,
-                np.ndarray X_idx_sorted=*)
+                np.ndarray X_idx_sorted=*,
+                np.ndarray categorical_features=*)
     cdef _check_input(self, object X, np.ndarray y, np.ndarray sample_weight)
diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx
index f8632ab1640d8..b4286aee641d8 100644
--- a/sklearn/tree/_tree.pyx
+++ b/sklearn/tree/_tree.pyx
@@ -31,6 +31,7 @@ from scipy.sparse import issparse
 from scipy.sparse import csc_matrix
 from scipy.sparse import csr_matrix
 
+from ._utils cimport goes_right
 from ._utils cimport Stack
 from ._utils cimport StackRecord
 from ._utils cimport PriorityHeap
@@ -65,12 +66,13 @@ cdef SIZE_t _TREE_LEAF = TREE_LEAF
 cdef SIZE_t _TREE_UNDEFINED = TREE_UNDEFINED
 cdef SIZE_t INITIAL_STACK_SIZE = 10
 
-# Repeat struct definition for numpy
+# Repeat struct definition of `Node` for numpy.
 NODE_DTYPE = np.dtype({
-    'names': ['left_child', 'right_child', 'feature', 'threshold', 'impurity',
-              'n_node_samples', 'weighted_n_node_samples'],
+    'names': ['left_child', 'right_child', 'feature', 'threshold',
+              'impurity', 'n_node_samples', 'weighted_n_node_samples',
+              'n_categories', 'split_map'],
     'formats': [np.intp, np.intp, np.intp, np.float64, np.float64, np.intp,
-                np.float64],
+                np.float64, np.intp, np.uint64],
     'offsets': [
         <Py_ssize_t> &(<Node*> NULL).left_child,
         <Py_ssize_t> &(<Node*> NULL).right_child,
@@ -78,10 +80,13 @@ NODE_DTYPE = np.dtype({
         <Py_ssize_t> &(<Node*> NULL).threshold,
         <Py_ssize_t> &(<Node*> NULL).impurity,
         <Py_ssize_t> &(<Node*> NULL).n_node_samples,
-        <Py_ssize_t> &(<Node*> NULL).weighted_n_node_samples
+        <Py_ssize_t> &(<Node*> NULL).weighted_n_node_samples,
+        <Py_ssize_t> &(<Node*> NULL).n_categories,
+        <Py_ssize_t> &(<Node*> NULL).split_map,
     ]
 })
 
+
 # =============================================================================
 # TreeBuilder
 # =============================================================================
@@ -91,7 +96,8 @@ cdef class TreeBuilder:
 
     cpdef build(self, Tree tree, object X, np.ndarray y,
                 np.ndarray sample_weight=None,
-                np.ndarray X_idx_sorted=None):
+                np.ndarray X_idx_sorted=None,
+                np.ndarray categorical_features=None):
         """Build a decision tree from the training set (X, y)."""
         pass
 
@@ -141,7 +147,8 @@ cdef class DepthFirstTreeBuilder(TreeBuilder):
 
     cpdef build(self, Tree tree, object X, np.ndarray y,
                 np.ndarray sample_weight=None,
-                np.ndarray X_idx_sorted=None):
+                np.ndarray X_idx_sorted=None,
+                np.ndarray categorical_features=None):
         """Build a decision tree from the training set (X, y)."""
 
         # check input
@@ -170,7 +177,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder):
         cdef double min_impurity_split = self.min_impurity_split
 
         # Recursive partition (without actual recursion)
-        splitter.init(X, y, sample_weight_ptr, X_idx_sorted)
+        splitter.init(X, y, sample_weight_ptr, X_idx_sorted, categorical_features)
 
         cdef SIZE_t start
         cdef SIZE_t end
@@ -234,7 +241,9 @@ cdef class DepthFirstTreeBuilder(TreeBuilder):
                     node_id = tree._add_node(parent, is_left, is_leaf, split.feature,
                                              split.threshold, impurity, n_node_samples,
-                                             weighted_n_node_samples)
+                                             weighted_n_node_samples,
+                                             split.n_categories,
+                                             split.split_map)
 
                     if node_id == <SIZE_t>(-1):
                         rc = -1
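Since the node record grows two fields, the parallel NumPy struct dtype gains `n_categories` (intp) and `split_map` (uint64). A sketch of an equivalent structured dtype, useful for eyeballing what a pickled tree's node array would contain under this patch; note the Cython code spells the offsets out from the C struct precisely because NumPy's default packing need not match C alignment:

```python
import numpy as np

# Mirror of the extended NODE_DTYPE from this patch (offsets computed
# by NumPy here, whereas the Cython code derives them from the struct).
node_dtype = np.dtype([
    ("left_child", np.intp),
    ("right_child", np.intp),
    ("feature", np.intp),
    ("threshold", np.float64),
    ("impurity", np.float64),
    ("n_node_samples", np.intp),
    ("weighted_n_node_samples", np.float64),
    ("n_categories", np.intp),   # negative sentinel for non-categorical nodes
    ("split_map", np.uint64),    # bit c == 1: category c goes right
])

nodes = np.zeros(3, dtype=node_dtype)
nodes[0]["n_categories"] = 4
nodes[0]["split_map"] = 0b1010   # categories 1 and 3 go right
print(nodes[0])
```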
@@ -302,7 +311,8 @@ cdef class BestFirstTreeBuilder(TreeBuilder):
 
     cpdef build(self, Tree tree, object X, np.ndarray y,
                 np.ndarray sample_weight=None,
-                np.ndarray X_idx_sorted=None):
+                np.ndarray X_idx_sorted=None,
+                np.ndarray categorical_features=None):
         """Build a decision tree from the training set (X, y)."""
 
         # check input
@@ -320,7 +330,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder):
         cdef SIZE_t min_samples_split = self.min_samples_split
 
         # Recursive partition (without actual recursion)
-        splitter.init(X, y, sample_weight_ptr, X_idx_sorted)
+        splitter.init(X, y, sample_weight_ptr, X_idx_sorted, categorical_features)
 
         cdef PriorityHeap frontier = PriorityHeap(INITIAL_STACK_SIZE)
         cdef PriorityHeapRecord record
@@ -362,6 +372,8 @@ cdef class BestFirstTreeBuilder(TreeBuilder):
                     node.right_child = _TREE_LEAF
                     node.feature = _TREE_UNDEFINED
                     node.threshold = _TREE_UNDEFINED
+                    node.n_categories = _TREE_UNDEFINED
+                    node.split_map = 0
 
                 else:
                     # Node is expandable
@@ -451,7 +463,9 @@ cdef class BestFirstTreeBuilder(TreeBuilder):
                                  else _TREE_UNDEFINED,
                                  is_left, is_leaf,
                                  split.feature, split.threshold, impurity, n_node_samples,
-                                 weighted_n_node_samples)
+                                 weighted_n_node_samples,
+                                 split.n_categories,
+                                 split.split_map)
         if node_id == <SIZE_t>(-1):
             return -1
@@ -577,10 +591,19 @@ cdef class Tree:
         def __get__(self):
             return self._get_node_ndarray()['weighted_n_node_samples'][:self.node_count]
 
+    property n_categories:
+        def __get__(self):
+            return self._get_node_ndarray()['n_categories'][:self.node_count]
+
+    property split_maps:
+        def __get__(self):
+            return self._get_node_ndarray()['split_map'][:self.node_count]
+
     property value:
         def __get__(self):
             return self._get_value_ndarray()[:self.node_count]
 
+
     def __cinit__(self, int n_features, np.ndarray[SIZE_t, ndim=1] n_classes,
                   int n_outputs):
         """Constructor."""
@@ -625,6 +648,7 @@ cdef class Tree:
         d["node_count"] = self.node_count
         d["nodes"] = self._get_node_ndarray()
         d["values"] = self._get_value_ndarray()
+
         return d
 
     def __setstate__(self, d):
@@ -641,6 +665,7 @@ cdef class Tree:
         value_shape = (node_ndarray.shape[0], self.n_outputs,
                        self.max_n_classes)
+
         if (node_ndarray.ndim != 1 or
                 node_ndarray.dtype != NODE_DTYPE or
                 not node_ndarray.flags.c_contiguous or
@@ -702,7 +727,8 @@ cdef class Tree:
 
     cdef SIZE_t _add_node(self, SIZE_t parent, bint is_left, bint is_leaf,
                           SIZE_t feature, double threshold, double impurity,
-                          SIZE_t n_node_samples, double weighted_n_node_samples) nogil:
+                          SIZE_t n_node_samples, double weighted_n_node_samples,
+                          SIZE_t n_categories, UINT64_t split_map) nogil:
         """Add a node to the tree.
 
         The new node registers itself as the child of its parent.
@@ -719,6 +745,8 @@ cdef class Tree:
         node.impurity = impurity
         node.n_node_samples = n_node_samples
         node.weighted_n_node_samples = weighted_n_node_samples
+        node.n_categories = n_categories
+        node.split_map = split_map
 
         if parent != _TREE_UNDEFINED:
             if is_left:
@@ -731,6 +759,8 @@ cdef class Tree:
             node.right_child = _TREE_LEAF
             node.feature = _TREE_UNDEFINED
             node.threshold = _TREE_UNDEFINED
+            node.n_categories = _TREE_UNDEFINED
+            node.split_map = 0
 
         else:
             # left_child and right_child will be set later
@@ -781,6 +811,8 @@ cdef class Tree:
         # Initialize auxiliary data-structure
         cdef Node* node = NULL
         cdef SIZE_t i = 0
+        cdef SIZE_t j
+        cdef DOUBLE_t sample_value
 
         with nogil:
             for i in range(n_samples):
@@ -788,11 +820,13 @@ cdef class Tree:
 
                 # While node not a leaf
                 while node.left_child != _TREE_LEAF:
                     # ... and node.right_child != _TREE_LEAF:
-                    if X_ptr[X_sample_stride * i +
-                             X_fx_stride * node.feature] <= node.threshold:
-                        node = &self.nodes[node.left_child]
-                    else:
+                    sample_value = X_ptr[X_sample_stride * i + X_fx_stride * node.feature]
+
+                    if goes_right(sample_value, node.threshold, node.n_categories,
+                                  node.split_map):
                         node = &self.nodes[node.right_child]
+                    else:
+                        node = &self.nodes[node.left_child]
 
                 out_ptr[i] = <SIZE_t>(node - self.nodes)  # node offset
diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd
index fce3abcb734db..6bbb3f150a675 100644
--- a/sklearn/tree/_utils.pxd
+++ b/sklearn/tree/_utils.pxd
@@ -17,6 +17,7 @@ ctypedef np.npy_float64 DOUBLE_t       # Type of y, sample_weight
 ctypedef np.npy_intp SIZE_t            # Type for indices and counters
 ctypedef np.npy_int32 INT32_t          # Signed 32 bit integer
 ctypedef np.npy_uint32 UINT32_t        # Unsigned 32 bit integer
+ctypedef np.npy_uint64 UINT64_t        # Unsigned 64 bit integer
 
 cdef enum:
     # Max value for our rand_r replacement (near the bottom).
@@ -48,15 +49,19 @@ cdef np.ndarray sizet_ptr_to_ndarray(SIZE_t* data, SIZE_t size)
 
 cdef SIZE_t rand_int(SIZE_t low, SIZE_t high,
-                     UINT32_t* random_state) nogil
+                    UINT32_t* random_state) nogil
 
 cdef double rand_uniform(double low, double high,
-                         UINT32_t* random_state) nogil
+                        UINT32_t* random_state) nogil
 
 cdef double log(double x) nogil
 
+
+cdef bint goes_right(DTYPE_t sample_value, double threshold,
+                     SIZE_t n_categories, UINT64_t split_map) nogil
+
+
 # =============================================================================
 # Stack data structure
 # =============================================================================
diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx
index a4ccc71946bd1..78be1ec245db3 100644
--- a/sklearn/tree/_utils.pyx
+++ b/sklearn/tree/_utils.pyx
@@ -85,6 +85,16 @@ cdef inline double log(double x) nogil:
     return ln(x) / ln(2.0)
 
 
+cdef inline bint goes_right(DTYPE_t sample_value, double threshold,
+                            SIZE_t n_categories, UINT64_t split_map) nogil:
+    """True if the sample value should go to the right node."""
+    if n_categories > 0:
+        # Categorical feature; 1 in the bitmap refers to the right node.
+        return (split_map >> (<SIZE_t> sample_value)) & 1
+    else:
+        # Continuous feature.
+        return sample_value > threshold
+
+
 # =============================================================================
 # Stack data structure
 # =============================================================================
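`goes_right` is the single routing predicate shared by training (node partitioning) and prediction (tree traversal): for categorical nodes it reads the sample's category bit out of the bitmap, otherwise it falls back to the usual threshold comparison. A direct pure-Python transliteration, for illustration only:

```python
def goes_right(sample_value, threshold, n_categories, split_map):
    """Pure-Python mirror of the Cython goes_right in _utils.pyx."""
    if n_categories > 0:
        # Categorical node: bit c == 1 in the map means "go right".
        return bool((split_map >> int(sample_value)) & 1)
    # Continuous node: ordinary threshold test.
    return sample_value > threshold

# Categorical node over 3 labels, with category 2 routed right:
assert goes_right(2.0, 0.0, 3, 0b100)
assert not goes_right(1.0, 0.0, 3, 0b100)
# Continuous node (n_categories stays at a negative sentinel):
assert goes_right(1.5, 1.0, -2, 0)
```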
diff --git a/sklearn/tree/export.py b/sklearn/tree/export.py
index 43e8aa11b9611..eb1d509d32f1e 100644
--- a/sklearn/tree/export.py
+++ b/sklearn/tree/export.py
@@ -71,7 +71,8 @@ def export_graphviz(decision_tree, out_file=SENTINEL, max_depth=None,
                     feature_names=None, class_names=None, label='all',
                     filled=False, leaves_parallel=False, impurity=True,
                     node_ids=False, proportion=False, rotate=False,
-                    rounded=False, special_characters=False):
+                    rounded=False, special_characters=False,
+                    split_map=True):
     """Export a decision tree in DOT format.
 
     This function generates a GraphViz representation of the decision tree,
@@ -212,6 +213,16 @@ def node_to_str(tree, node_id, criterion):
                 node_string += 'node '
             node_string += characters[0] + str(node_id) + characters[4]
 
+        # Write categorical split map
+        if split_map:
+            if labels:
+                if tree.n_categories[node_id] > 0:
+                    # Categorical feature
+                    _n_categories = tree.n_categories[node_id]
+                    _split_map = str(bin(tree.split_maps[node_id]))[2:]
+                    node_string += "0" * (_n_categories - len(_split_map)) + \
+                                   _split_map + characters[4]
+
         # Write decision criteria
         if tree.children_left[node_id] != _tree.TREE_LEAF:
             # Always write node decision criteria, except for leaves
@@ -221,10 +232,15 @@ def node_to_str(tree, node_id, criterion):
                 feature = "X%s%s%s" % (characters[1],
                                        tree.feature[node_id],
                                        characters[2])
-            node_string += '%s %s %s%s' % (feature,
-                                           characters[3],
-                                           round(tree.threshold[node_id], 4),
-                                           characters[4])
+
+            if tree.n_categories[node_id] > 0:
+                # Categorical feature
+                node_string += '%s%s' % (feature, characters[4])
+            else:
+                node_string += '%s %s %s%s' % (feature,
+                                               characters[3],
+                                               round(tree.threshold[node_id], 4),
+                                               characters[4])
 
         # Write impurity
         if impurity:
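`export_graphviz` now prints the bitmap zero-padded to the node's category count, most significant bit first (Python's `bin()` ordering), so bit 0 (category 0) is the last character. A tiny sketch of turning that string back into the two category sets (`decode_split_map` is an illustrative helper, not part of the patch):

```python
def decode_split_map(bits):
    """Split a zero-padded bitmap string from export_graphviz into
    (left_categories, right_categories).

    bin() prints the most significant bit first, so bits[-1] is category 0.
    """
    right = {i for i, b in enumerate(reversed(bits)) if b == "1"}
    left = set(range(len(bits))) - right
    return left, right

# A node over 4 categories where categories 1 and 3 go right is
# rendered as "1010":
assert decode_split_map("1010") == ({0, 2}, {1, 3})
```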
diff --git a/sklearn/tree/tree.py b/sklearn/tree/tree.py
index c3567e864c10b..faacbc797ff4a 100644
--- a/sklearn/tree/tree.py
+++ b/sklearn/tree/tree.py
@@ -61,11 +61,14 @@
                 "mae": _criterion.MAE}
 
 DENSE_SPLITTERS = {"best": _splitter.BestSplitter,
-                   "random": _splitter.RandomSplitter}
+                   "random": _splitter.RandomSplitter,
+                   "smart": _splitter.SmartSplitter}
 
 SPARSE_SPLITTERS = {"best": _splitter.BestSparseSplitter,
                    "random": _splitter.RandomSparseSplitter}
 
+MAX_CATEGORICAL_LABEL = 64
+
 # =============================================================================
 # Base decision tree
 # =============================================================================
@@ -115,7 +118,7 @@ def __init__(self,
         self.max_features_ = None
 
     def fit(self, X, y, sample_weight=None, check_input=True,
-            X_idx_sorted=None):
+            X_idx_sorted=None, categorical_features=None):
         random_state = check_random_state(self.random_state)
         if check_input:
@@ -310,6 +313,25 @@ def fit(self, X, y, sample_weight=None, check_input=True,
                                  ".shape = {})".format(X.shape,
                                                        X_idx_sorted.shape))
 
+        if categorical_features is not None:
+            if max(categorical_features) >= self.n_features_:
+                raise ValueError("Categorical feature indices are out of "
+                                 "range; there are only {} features.".format(
+                                     self.n_features_))
+
+            for f_idx in categorical_features:
+                uniq_values = set(map(int, X[:, f_idx]))
+
+                if max(uniq_values) + 1 > MAX_CATEGORICAL_LABEL:
+                    raise ValueError("The cardinality ({}) of feature {} is "
+                                     "too high (> {}).".format(
+                                         max(uniq_values) + 1, f_idx,
+                                         MAX_CATEGORICAL_LABEL))
+
+                if uniq_values != set(range(max(uniq_values) + 1)):
+                    raise ValueError("Categorical features should go through "
+                                     "LabelEncoder first.")
+
         # Build tree
         criterion = self.criterion
         if not isinstance(criterion, Criterion):
@@ -324,6 +346,8 @@ def fit(self, X, y, sample_weight=None, check_input=True,
 
         splitter = self.splitter
         if not isinstance(self.splitter, Splitter):
+            if self.splitter not in SPLITTERS:
+                raise ValueError("Unknown splitter '%s' for this type of "
+                                 "input." % self.splitter)
             splitter = SPLITTERS[self.splitter](criterion,
                                                 self.max_features_,
                                                 min_samples_leaf,
@@ -347,7 +371,10 @@ def fit(self, X, y, sample_weight=None, check_input=True,
                                            max_leaf_nodes,
                                            self.min_impurity_split)
 
-        builder.build(self.tree_, X, y, sample_weight, X_idx_sorted)
+        if categorical_features is not None:
+            categorical_features = np.asfortranarray(categorical_features)
+
+        builder.build(self.tree_, X, y, sample_weight, X_idx_sorted,
+                      categorical_features)
 
         if self.n_outputs_ == 1:
             self.n_classes_ = self.n_classes_[0]
@@ -696,7 +723,7 @@ def __init__(self,
             presort=presort)
 
     def fit(self, X, y, sample_weight=None, check_input=True,
-            X_idx_sorted=None):
+            X_idx_sorted=None, categorical_features=None):
         """Build a decision tree classifier from the training set (X, y).
 
         Parameters
@@ -726,6 +753,9 @@ def fit(self, X, y, sample_weight=None, check_input=True,
             cached between trees. If None, the data will be sorted here.
             Don't use this parameter unless you know what to do.
 
+        categorical_features : array, optional
+            A list of indices of categorical features.
+
         Returns
         -------
         self : object
@@ -736,7 +766,8 @@ def fit(self, X, y, sample_weight=None, check_input=True,
             X, y,
             sample_weight=sample_weight,
             check_input=check_input,
-            X_idx_sorted=X_idx_sorted)
+            X_idx_sorted=X_idx_sorted,
+            categorical_features=categorical_features)
         return self
 
@@ -834,8 +865,11 @@ class DecisionTreeRegressor(BaseDecisionTree, RegressorMixin):
 
     splitter : string, optional (default="best")
         The strategy used to choose the split at each node. Supported
-        strategies are "best" to choose the best split and "random" to choose
-        the best random split.
+        strategies are
+        - "best" to choose the best split;
+        - "random" to choose the best random split;
+        - "smart" to choose the best split with smart treatment of
+          categorical features based on local response proportion.
 
     max_features : int, float, string or None, optional (default=None)
         The number of features to consider when looking for the best split:
@@ -987,7 +1021,7 @@ def __init__(self,
             presort=presort)
 
     def fit(self, X, y, sample_weight=None, check_input=True,
-            X_idx_sorted=None):
+            X_idx_sorted=None, categorical_features=None):
         """Build a decision tree regressor from the training set (X, y).
 
         Parameters
@@ -1016,6 +1050,9 @@ def fit(self, X, y, sample_weight=None, check_input=True,
             cached between trees. If None, the data will be sorted here.
             Don't use this parameter unless you know what to do.
 
+        categorical_features : array, optional
+            A list of indices of categorical features.
+
         Returns
         -------
         self : object
@@ -1026,7 +1063,8 @@ def fit(self, X, y, sample_weight=None, check_input=True,
             X, y,
             sample_weight=sample_weight,
             check_input=check_input,
-            X_idx_sorted=X_idx_sorted)
+            X_idx_sorted=X_idx_sorted,
+            categorical_features=categorical_features)
         return self
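Putting it together, usage under this patch would look like the following: the `"smart"` splitter is registered for dense input, and categorical columns must be label-encoded to contiguous integers in 0..63 first, as the new validation in `fit` enforces. An untested sketch:

```python
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeRegressor

rng = np.random.RandomState(0)
color = rng.choice(["red", "green", "blue"], size=200)   # categorical column
x1 = rng.rand(200)                                       # continuous column
y = (color == "red") * 2.0 + x1 + rng.normal(scale=0.1, size=200)

# The patch requires categorical columns to be dense integer labels.
color_enc = LabelEncoder().fit_transform(color)
X = np.column_stack([color_enc, x1]).astype(np.float32)

tree = DecisionTreeRegressor(splitter="smart", random_state=0)
tree.fit(X, y, categorical_features=[0])  # column 0 is categorical

print(tree.tree_.n_categories[:5])  # per-node cardinality (negative if not categorical)
print(tree.tree_.split_maps[:5])    # per-node right-child bitmaps
```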