8000 - Add SmartSplitter: use local response proportion for categorical fe… · scikit-learn/scikit-learn@cd6c7bd · GitHub
[go: up one dir, main page]

Skip to content

Commit cd6c7bd

Browse files
author
Lilian Weng
committed
- Add SmartSplitter: use local response proportion for categorical feature split in decision tree.
- Use bitmap for tracking categorical split
- Update tree.export.export_graphviz()
1 parent fc3bec7 commit cd6c7bd

File tree

10 files changed

+502
-43
lines changed

10 files changed

+502
-43
lines changed

sklearn/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@
3737
# Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer.
3838
# 'X.Y.dev0' is the canonical version of 'X.Y.dev'
3939
#
40-
__version__ = '0.19.dev0'
40+
__version__ = '0.19.dev1'
4141

4242

4343
try:

sklearn/tree/_criterion.pyx

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -969,6 +969,7 @@ cdef class MSE(RegressionCriterion):
969969
impurity_left[0] /= self.n_outputs
970970
impurity_right[0] /= self.n_outputs
971971

972+
972973
cdef class MAE(RegressionCriterion):
973974
"""Mean absolute error impurity criterion
974975

sklearn/tree/_splitter.pxd

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ ctypedef np.npy_float64 DOUBLE_t # Type of y, sample_weight
1919
ctypedef np.npy_intp SIZE_t # Type for indices and counters
2020
ctypedef np.npy_int32 INT32_t # Signed 32 bit integer
2121
ctypedef np.npy_uint32 UINT32_t # Unsigned 32 bit integer
22+
ctypedef np.npy_uint64 UINT64_t # Unsigned 64 bit integer
2223

2324
cdef struct SplitRecord:
2425
# Data to track sample split
@@ -30,6 +31,9 @@ cdef struct SplitRecord:
3031
double improvement # Impurity improvement given parent node.
3132
double impurity_left # Impurity of the left split.
3233
double impurity_right # Impurity of the right split.
34+
SIZE_t n_categories # Num. of categories of the feature; -1 if not categorical.
35+
UINT64_t split_map # bitmap guiding how to split; 1 means right node.
36+
3337

3438
cdef class Splitter:
3539
# The splitter searches in the input space for a feature and a threshold
@@ -83,7 +87,8 @@ cdef class Splitter:
8387
# Methods
8488
cdef void init(self, object X, np.ndarray y,
8589
DOUBLE_t* sample_weight,
86-
np.ndarray X_idx_sorted=*) except *
90+
np.ndarray X_idx_sorted=*,
91+
np.ndarray categorical_features=*) except *
8792

8893
cdef void node_reset(self, SIZE_t start, SIZE_t end,
8994
double* weighted_n_node_samples) nogil

0 commit comments

Comments
 (0)
0