8000 - Add SmartSplitter: use local response proportion for categorical fe… · scikit-learn/scikit-learn@cd6c7bd · GitHub
[go: up one dir, main page]

Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit cd6c7bd

Browse files
author
Lilian Weng
committed
- Add SmartSplitter: use local response proportion for categorical feature split in decision tree.
- Use bitmap for tracking categorical split
- Update tree.export.export_graphviz()
1 parent fc3bec7 commit cd6c7bd

File tree

10 files changed

+502
-43
lines changed

10 files changed

+502
-43
lines changed

sklearn/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@
3737
# Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer.
3838
# 'X.Y.dev0' is the canonical version of 'X.Y.dev'
3939
#
40-
__version__ = '0.19.dev0'
40+
__version__ = '0.19.dev1'
4141

4242

4343
try:

sklearn/tree/_criterion.pyx

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -969,6 +969,7 @@ cdef class MSE(RegressionCriterion):
969969
impurity_left[0] /= self.n_outputs
970970
impurity_right[0] /= self.n_outputs
971971

972+
972973
cdef class MAE(RegressionCriterion):
973974
"""Mean absolute error impurity criterion
974975

sklearn/tree/_splitter.pxd

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ ctypedef np.npy_float64 DOUBLE_t # Type of y, sample_weight
1919
ctypedef np.npy_intp SIZE_t # Type for indices and counters
2020
ctypedef np.npy_int32 INT32_t # Signed 32 bit integer
2121
ctypedef np.npy_uint32 UINT32_t # Unsigned 32 bit integer
22+
ctypedef np.npy_uint64 UINT64_t # Unsigned 64 bit integer
2223

2324
cdef struct SplitRecord:
2425
# Data to track sample split
@@ -30,6 +31,9 @@ cdef struct SplitRecord:
3031
double improvement # Impurity improvement given parent node.
3132
double impurity_left # Impurity of the left split.
3233
double impurity_right # Impurity of the right split.
34+
SIZE_t n_categories # Num. of categories of the feature; -1 if not categorical.
35+
UINT64_t split_map # bitmap guiding how to split; 1 means right node.
36+
3337

3438
cdef class Splitter:
3539
# The splitter searches in the input space for a feature and a threshold
@@ -83,7 +87,8 @@ cdef class Splitter:
8387
# Methods
8488
cdef void init(self, object X, np.ndarray y,
8589
DOUBLE_t* sample_weight,
86-
np.ndarray X_idx_sorted=*) except *
90+
np.ndarray X_idx_sorted=*,
91+
np.ndarray categorical_features=*) except *
8792

8893
cdef void node_reset(self, SIZE_t start, SIZE_t end,
8994
double* weighted_n_node_samples) nogil

0 commit comments

Comments
 (0)
0