diff --git a/sklearn/__init__.py b/sklearn/__init__.py
index 4e80774bc3110..d1db93e9ab1b5 100644
--- a/sklearn/__init__.py
+++ b/sklearn/__init__.py
@@ -37,7 +37,7 @@
 # Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer.
 # 'X.Y.dev0' is the canonical version of 'X.Y.dev'
 #
-__version__ = '0.19.dev0'
+__version__ = '0.19.dev1'
 
 try:
diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx
index 26c40dc8d6616..0d30ef9b9f945 100644
--- a/sklearn/tree/_criterion.pyx
+++ b/sklearn/tree/_criterion.pyx
@@ -969,6 +969,7 @@ cdef class MSE(RegressionCriterion):
         impurity_left[0] /= self.n_outputs
         impurity_right[0] /= self.n_outputs
 
+
 cdef class MAE(RegressionCriterion):
     """Mean absolute error impurity criterion
diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd
index 4b97d01614b9f..36bc7f7c8d4b1 100644
--- a/sklearn/tree/_splitter.pxd
+++ b/sklearn/tree/_splitter.pxd
@@ -19,6 +19,7 @@ ctypedef np.npy_float64 DOUBLE_t       # Type of y, sample_weight
 ctypedef np.npy_intp SIZE_t            # Type for indices and counters
 ctypedef np.npy_int32 INT32_t          # Signed 32 bit integer
 ctypedef np.npy_uint32 UINT32_t        # Unsigned 32 bit integer
+ctypedef np.npy_uint64 UINT64_t        # Unsigned 64 bit integer
 
 cdef struct SplitRecord:
     # Data to track sample split
@@ -30,6 +31,9 @@ cdef struct SplitRecord:
     double improvement     # Impurity improvement given parent node.
     double impurity_left   # Impurity of the left split.
     double impurity_right  # Impurity of the right split.
+    SIZE_t n_categories    # Num. of categories of the feature; -1 if not categorical.
+    UINT64_t split_map     # Bitmap guiding how to split; 1 means right node.
+
 
 cdef class Splitter:
     # The splitter searches in the input space for a feature and a threshold
@@ -83,7 +87,8 @@ cdef class Splitter:
     # Methods
     cdef void init(self, object X, np.ndarray y,
                    DOUBLE_t* sample_weight,
-                   np.ndarray X_idx_sorted=*) except *
+                   np.ndarray X_idx_sorted=*,
+                   np.ndarray categorical_features=*) except *
 
     cdef void node_reset(self, SIZE_t start, SIZE_t end,
                          double* weighted_n_node_samples) nogil
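The two new `SplitRecord` fields encode a categorical split as a 64-bit bitmap: bit `c` set to 1 sends samples with category label `c` to the right child. A minimal pure-Python sketch of that encoding (helper names here are illustrative, not part of the patch):

```python
def make_split_map(right_categories):
    """Pack category labels (0..63) into a uint64-style bitmap.

    Bit c == 1 means "category c goes to the right child", mirroring
    the semantics of SplitRecord.split_map in this patch.
    """
    split_map = 0
    for c in right_categories:
        if not 0 <= c < 64:
            raise ValueError("label %d does not fit in a 64-bit map" % c)
        split_map |= 1 << c
    return split_map


def category_goes_right(split_map, category):
    """Read one category's direction back out of the bitmap."""
    return bool((split_map >> category) & 1)


# Example: categories {1, 3} go right, everything else goes left.
m = make_split_map({1, 3})
assert m == 0b1010
assert category_goes_right(m, 3) and not category_goes_right(m, 0)
```

This 64-bit representation is also what motivates the hard cardinality cap (`MAX_CATEGORICAL_LABEL = 64`) enforced later in `tree.py`.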
diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx
index 5fa7ee553fe2d..3d3fcb36860fe 100644
--- a/sklearn/tree/_splitter.pyx
+++ b/sklearn/tree/_splitter.pyx
@@ -28,6 +28,7 @@ np.import_array()
 
 from scipy.sparse import csc_matrix
 
+from ._utils cimport goes_right
 from ._utils cimport log
 from ._utils cimport rand_int
 from ._utils cimport rand_uniform
@@ -43,13 +44,21 @@ cdef DTYPE_t FEATURE_THRESHOLD = 1e-7
 # in SparseSplitter
 cdef DTYPE_t EXTRACT_NNZ_SWITCH = 0.1
 
-cdef inline void _init_split(SplitRecord* self, SIZE_t start_pos) nogil:
+cdef SIZE_t MAX_CATEGORICAL_LABEL = 64
+
+cdef DOUBLE_t Y_MEAN_UNDEFINED = -1
+cdef SIZE_t CARDINALITY_UNDEFINED = -2
+
+cdef inline void _init_split(SplitRecord* self, SIZE_t start_pos) nogil:
     self.impurity_left = INFINITY
     self.impurity_right = INFINITY
    self.pos = start_pos
     self.feature = 0
     self.threshold = 0.
     self.improvement = -INFINITY
+    self.n_categories = CARDINALITY_UNDEFINED
+    self.split_map = 0
+
 
 cdef class Splitter:
     """Abstract splitter class.
@@ -120,7 +129,8 @@ cdef class Splitter:
                    object X,
                    np.ndarray[DOUBLE_t, ndim=2, mode="c"] y,
                    DOUBLE_t* sample_weight,
-                   np.ndarray X_idx_sorted=None) except *:
+                   np.ndarray X_idx_sorted=None,
+                   np.ndarray categorical_features=None) except *:
        """Initialize the splitter.

        Take in the input data X, the target Y, and optional sample weights.
@@ -137,8 +147,14 @@ cdef class Splitter:
             The weights of the samples, where higher weighted samples are fit
             closer than lower weight samples. If not provided, all samples
             are assumed to have uniform weight.
-        """
+        X_idx_sorted : np.ndarray
+            Only the ``BaseDenseSplitter`` subclass handles this parameter.
+
+        categorical_features : np.ndarray
+            Only the ``BaseDenseSplitter`` subclass handles this parameter.
 
+        """
         self.rand_r_state = self.random_state.randint(0, RAND_R_MAX)
 
         cdef SIZE_t n_samples = X.shape[0]
@@ -179,8 +195,6 @@ cdef class Splitter:
         self.y = <DOUBLE_t*> y.data
         self.y_stride = <SIZE_t> y.strides[0] / <SIZE_t> y.itemsize
 
-        self.sample_weight = sample_weight
-
     cdef void node_reset(self, SIZE_t start, SIZE_t end,
                          double* weighted_n_node_samples) nogil:
         """Reset splitter on node samples[start:end].
@@ -240,6 +254,13 @@ cdef class BaseDenseSplitter(Splitter):
     cdef SIZE_t n_total_samples
     cdef SIZE_t* sample_mask
 
+    cdef np.ndarray categorical_features
+    cdef SIZE_t* categorical_flags
+
+    cdef DOUBLE_t* feature_y_sum
+    cdef DOUBLE_t* feature_y_count
+    cdef DOUBLE_t* feature_y_mean
+
     def __cinit__(self, Criterion criterion, SIZE_t max_features,
                   SIZE_t min_samples_leaf, double min_weight_leaf,
                   object random_state, bint presort):
@@ -252,16 +273,29 @@ cdef class BaseDenseSplitter(Splitter):
         self.sample_mask = NULL
         self.presort = presort
 
+        self.categorical_flags = NULL
+
+        self.feature_y_sum = NULL
+        self.feature_y_count = NULL
+        self.feature_y_mean = NULL
+
     def __dealloc__(self):
         """Destructor."""
         if self.presort == 1:
             free(self.sample_mask)
 
+        free(self.categorical_flags)
+
+        free(self.feature_y_sum)
+        free(self.feature_y_count)
+        free(self.feature_y_mean)
+
     cdef void init(self,
                    object X,
                    np.ndarray[DOUBLE_t, ndim=2, mode="c"] y,
                    DOUBLE_t* sample_weight,
-                   np.ndarray X_idx_sorted=None) except *:
+                   np.ndarray X_idx_sorted=None,
+                   np.ndarray categorical_features=None) except *:
         """Initialize the splitter."""
 
         # Call parent init
@@ -282,7 +316,26 @@ cdef class BaseDenseSplitter(Splitter):
             self.n_total_samples = X.shape[0]
             safe_realloc(&self.sample_mask, self.n_total_samples)
-            memset(self.sample_mask, 0, self.n_total_samples*sizeof(SIZE_t))
+            memset(self.sample_mask, 0, self.n_total_samples * sizeof(SIZE_t))
+
+        safe_realloc(&self.categorical_flags, self.n_features)
+        memset(self.categorical_flags, 0, self.n_features * sizeof(SIZE_t))
+
+        cdef SIZE_t i
+        for i in range(self.n_features):
+            self.categorical_flags[i] = 0
+
+        if categorical_features is not None:
+            for i in range(categorical_features.size):
+                self.categorical_flags[<SIZE_t> categorical_features[i]] = 1
+
+        safe_realloc(&self.feature_y_sum, MAX_CATEGORICAL_LABEL)
+        safe_realloc(&self.feature_y_count, MAX_CATEGORICAL_LABEL)
+        safe_realloc(&self.feature_y_mean, MAX_CATEGORICAL_LABEL)
+
+        memset(self.feature_y_sum, 0, MAX_CATEGORICAL_LABEL * sizeof(DOUBLE_t))
+        memset(self.feature_y_count, 0, MAX_CATEGORICAL_LABEL * sizeof(DOUBLE_t))
+        memset(self.feature_y_mean, 0, MAX_CATEGORICAL_LABEL * sizeof(DOUBLE_t))
 
 
 cdef class BestSplitter(BaseDenseSplitter):
@@ -404,7 +457,7 @@ cdef class BestSplitter(BaseDenseSplitter):
                     p = start
                     feature_idx_offset = self.X_idx_sorted_stride * current.feature
 
-                    for i in range(self.n_total_samples): 
+                    for i in range(self.n_total_samples):
                         j = X_idx_sorted[i + feature_idx_offset]
                         if sample_mask[j] == 1:
                             samples[p] = j
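The `feature_y_sum`, `feature_y_count` and `feature_y_mean` buffers allocated above are fixed-size scratch arrays, one slot per possible category label, used to compute the mean target per category at each node. A NumPy sketch of the same computation (hypothetical helper, single-output regression assumed):

```python
import numpy as np

def per_category_mean(x_cat, y, n_categories=64):
    """Mean of y per integer category label, NaN where a label is absent.

    Mirrors the feature_y_sum / feature_y_count / feature_y_mean buffers
    that BaseDenseSplitter.init allocates in this patch.
    """
    sums = np.zeros(n_categories)
    counts = np.zeros(n_categories)
    np.add.at(sums, x_cat, y)
    np.add.at(counts, x_cat, 1)
    return np.where(counts > 0, sums / np.maximum(counts, 1), np.nan)

x_cat = np.array([0, 0, 2, 2, 2, 5])
y = np.array([1.0, 3.0, 0.0, 0.0, 3.0, 10.0])
means = per_category_mean(x_cat, y)
assert means[0] == 2.0 and means[2] == 1.0 and means[5] == 10.0
```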
@@ -511,6 +564,308 @@ cdef class BestSplitter(BaseDenseSplitter):
         n_constant_features[0] = n_total_constants
 
 
+cdef class SmartSplitter(BaseDenseSplitter):
+    """Splitter that can handle categorical features in a smart way."""
+
+    def __reduce__(self):
+        return (SmartSplitter, (self.criterion,
+                                self.max_features,
+                                self.min_samples_leaf,
+                                self.min_weight_leaf,
+                                self.random_state,
+                                self.presort), self.__getstate__())
+
+    cdef void node_split(self, double impurity, SplitRecord* split,
+                         SIZE_t* n_constant_features) nogil:
+        """Find the best split on node samples[start:end]."""
+        # Find the best split
+        cdef SIZE_t* samples = self.samples
+        cdef SIZE_t start = self.start
+        cdef SIZE_t end = self.end
+
+        cdef SIZE_t* features = self.features
+        cdef SIZE_t* constant_features = self.constant_features
+        cdef SIZE_t n_features = self.n_features
+
+        cdef DTYPE_t* X = self.X
+        cdef DTYPE_t* Xf = self.feature_values
+        cdef SIZE_t X_sample_stride = self.X_sample_stride
+        cdef SIZE_t X_feature_stride = self.X_feature_stride
+        cdef SIZE_t max_features = self.max_features
+        cdef SIZE_t min_samples_leaf = self.min_samples_leaf
+        cdef double min_weight_leaf = self.min_weight_leaf
+        cdef UINT32_t* random_state = &self.rand_r_state
+
+        cdef INT32_t* X_idx_sorted = self.X_idx_sorted_ptr
+        cdef SIZE_t* sample_mask = self.sample_mask
+
+        cdef SplitRecord best, current
+        cdef double current_proxy_improvement = -INFINITY
+        cdef double best_proxy_improvement = -INFINITY
+
+        cdef SIZE_t f_i = n_features
+        cdef SIZE_t f_j
+        cdef SIZE_t tmp
+        cdef SIZE_t p
+        cdef SIZE_t feature_idx_offset
+        cdef SIZE_t feature_offset
+        cdef SIZE_t i
+        cdef SIZE_t j
+
+        cdef SIZE_t n_visited_features = 0
+        # Number of features discovered to be constant during the split search
+        cdef SIZE_t n_found_constants = 0
+        # Number of features known to be constant and drawn without replacement
+        cdef SIZE_t n_drawn_constants = 0
+        cdef SIZE_t n_known_constants = n_constant_features[0]
+        # n_total_constants = n_known_constants + n_found_constants
+        cdef SIZE_t n_total_constants = n_known_constants
+        cdef DTYPE_t current_feature_value
+        cdef SIZE_t partition_end
+
+        # For categorical features
+        cdef UINT64_t split_map = 0
+        cdef SIZE_t* categorical_flags = self.categorical_flags
+        cdef SIZE_t is_categorical = 0
+        cdef DTYPE_t sample_value
+        cdef SIZE_t sample_label
+        cdef SIZE_t sample_max_label = 0
+        cdef DOUBLE_t* feature_y_sum = self.feature_y_sum
+        cdef DOUBLE_t* feature_y_count = self.feature_y_count
+        cdef DOUBLE_t* feature_y_mean = self.feature_y_mean
+
+        _init_split(&best, end)
+
+        if self.presort == 1:
+            for p in range(start, end):
+                sample_mask[samples[p]] = 1
+
+        # Sample up to max_features without replacement using a
+        # Fisher-Yates-based algorithm (using the local variables `f_i` and
+        # `f_j` to compute a permutation of the `features` array).
+        #
+        # Skip the CPU intensive evaluation of the impurity criterion for
+        # features that were already detected as constant (hence not suitable
+        # for good splitting) by ancestor nodes and save the information on
+        # newly discovered constant features to spare computation on descendant
+        # nodes.
+        while (f_i > n_total_constants and  # Stop early if remaining features
+                                            # are constant
+                (n_visited_features < max_features or
+                 # At least one drawn feature must be non constant
+                 n_visited_features <= n_found_constants + n_drawn_constants)):
+
+            n_visited_features += 1
+
+            # Loop invariant: elements of features in
+            # - [:n_drawn_constant[ holds drawn and known constant features;
+            # - [n_drawn_constant:n_known_constant[ holds known constant
+            #   features that haven't been drawn yet;
+            # - [n_known_constant:n_total_constant[ holds newly found constant
+            #   features;
+            # - [n_total_constant:f_i[ holds features that haven't been drawn
+            #   yet and aren't constant a priori.
+            # - [f_i:n_features[ holds features that have been drawn
+            #   and aren't constant.
+
+            # Draw a feature at random
+            f_j = rand_int(n_drawn_constants, f_i - n_found_constants,
+                           random_state)
+
+            sample_max_label = 0
+
+            if f_j < n_known_constants:
+                # f_j in the interval [n_drawn_constants, n_known_constants[
+                tmp = features[f_j]
+                features[f_j] = features[n_drawn_constants]
+                features[n_drawn_constants] = tmp
+
+                n_drawn_constants += 1
+
+            else:
+                # f_j in the interval [n_known_constants, f_i - n_found_constants[
+                f_j += n_found_constants
+                # f_j in the interval [n_total_constants, f_i[
+                current.feature = features[f_j]
+                current.n_categories = CARDINALITY_UNDEFINED
+                feature_offset = self.X_feature_stride * current.feature
+                is_categorical = categorical_flags[current.feature]
+
+                ################################################################
+                if is_categorical:
+                    # Here we have a categorical feature; treat it very differently.
+                    for i in range(start, end):
+                        Xf[i] = X[self.X_sample_stride * samples[i] + feature_offset]
+                        if Xf[i] > sample_max_label:
+                            sample_max_label = Xf[i]
+
+                    current.n_categories = sample_max_label + 1
+
+                    # Compute the mean of y for each categorical label.
+                    # TODO: Figure out a way to handle corner cases when we cannot
+                    # observe all the categorical values in the sub-nodes.
+                    for i in range(sample_max_label + 1):
+                        # Initialize, `i` is the categorical label.
+                        feature_y_sum[i] = 0.0
+                        feature_y_count[i] = 0.0
+                        feature_y_mean[i] = Y_MEAN_UNDEFINED
+
+                    for p in range(start, end):
+                        feature_y_sum[<SIZE_t> Xf[p]] += self.y[samples[p] * self.y_stride]
+                        feature_y_count[<SIZE_t> Xf[p]] += 1
+
+                    for i in range(sample_max_label + 1):
+                        if feature_y_count[i] > 0:
+                            feature_y_mean[i] = feature_y_sum[i] / feature_y_count[i]
+
+                    # First sort Xf and samples by the categorical labels
+                    sort(Xf + start, samples + start, end - start)
+
+                    # Update Xf with the transformed value
+                    for p in range(start, end):
+                        Xf[p] = feature_y_mean[<SIZE_t> Xf[p]]
+
+                    # Then sort Xf and samples by the mean y associated with each label.
+                    sort(Xf + start, samples + start, end - start)
+
+                    # TODO: Add a new array variable to avoid assigning the values again.
+                    for i in range(start, end):
+                        Xf[i] = X[self.X_sample_stride * samples[i] + feature_offset]
+
+                ################################################################
+                else:
+                    # Here we have a continuous feature.
+                    # Sort samples along that feature; either by utilizing
+                    # presorting, or by copying the values into an array and
+                    # sorting the array in a manner which utilizes the cache more
+                    # effectively.
+                    if self.presort == 1:
+                        p = start
+                        feature_idx_offset = self.X_idx_sorted_stride * current.feature
+
+                        for i in range(self.n_total_samples):
+                            j = X_idx_sorted[i + feature_idx_offset]
+                            if sample_mask[j] == 1:
+                                samples[p] = j
+                                Xf[p] = X[self.X_sample_stride * j + feature_offset]
+                                p += 1
+                    else:
+                        for i in range(start, end):
+                            Xf[i] = X[self.X_sample_stride * samples[i] + feature_offset]
+
+                        sort(Xf + start, samples + start, end - start)
+
+                ################################################################
+
+                if ((not is_categorical and Xf[end - 1] <= Xf[start] + FEATURE_THRESHOLD) or
+                        (is_categorical and Xf[end - 1] == Xf[start])):
+                    # This is a constant feature.
+                    features[f_j] = features[n_total_constants]
+                    features[n_total_constants] = current.feature
+
+                    n_found_constants += 1
+                    n_total_constants += 1
+
+                else:
+                    # This is not a constant feature.
+                    f_i -= 1
+                    features[f_i], features[f_j] = features[f_j], features[f_i]
+
+                    # Evaluate all splits
+                    self.criterion.reset()
+                    p = start
+
+                    while p < end:
+                        while (p + 1 < end and (
+                                (not is_categorical and Xf[p + 1] <= Xf[p] + FEATURE_THRESHOLD) or
+                                (is_categorical and Xf[p + 1] == Xf[p]))):
+                            p += 1
+
+                        # (p + 1 >= end) or (X[samples[p + 1], current.feature] !=
+                        #                    X[samples[p], current.feature])
+                        p += 1
+                        # (p >= end) or (X[samples[p], current.feature] !=
+                        #                X[samples[p - 1], current.feature])
+
+                        if p < end:
+                            current.pos = p
+
+                            # Reject if min_samples_leaf is not guaranteed
+                            if (((current.pos - start) < min_samples_leaf) or
+                                    ((end - current.pos) < min_samples_leaf)):
+                                continue
+
+                            self.criterion.update(current.pos)
+
+                            # Reject if min_weight_leaf is not satisfied
+                            if ((self.criterion.weighted_n_left < min_weight_leaf) or
+                                    (self.criterion.weighted_n_right < min_weight_leaf)):
+                                continue
+
+                            current_proxy_improvement = self.criterion.proxy_impurity_improvement()
+
+                            if current_proxy_improvement > best_proxy_improvement:
+                                best_proxy_improvement = current_proxy_improvement
+                                current.threshold = (Xf[p - 1] + Xf[p]) / 2.0
+
+                                if current.threshold == Xf[p]:
+                                    current.threshold = Xf[p - 1]
+
+                                if is_categorical:
+                                    split_map = 0
+                                    for i in range(current.pos, end):
+                                        # 1 in the bitmap refers to the right node.
+                                        split_map |= (<UINT64_t> 1) << (<SIZE_t> Xf[i])
+                                else:
+                                    split_map = 0
+
+                                best = current  # copy
+                                best.split_map = split_map
+
+        # Reorganize into samples[start:best.pos] + samples[best.pos:end]
+        if best.pos < end:
+            feature_offset = X_feature_stride * best.feature
+            partition_end = end
+            p = start
+
+            while p < partition_end:
+                sample_value = X[X_sample_stride * samples[p] + feature_offset]
+
+                if goes_right(sample_value, best.threshold, best.n_categories,
+                              best.split_map):
+                    partition_end -= 1
+
+                    tmp = samples[partition_end]
+                    samples[partition_end] = samples[p]
+                    samples[p] = tmp
+                else:
+                    p += 1
+
+            self.criterion.reset()
+            self.criterion.update(best.pos)
+            best.improvement = self.criterion.impurity_improvement(impurity)
+            self.criterion.children_impurity(&best.impurity_left,
+                                             &best.impurity_right)
+
+        # Reset sample mask
+        if self.presort == 1:
+            for p in range(start, end):
+                sample_mask[samples[p]] = 0
+
+        # Respect invariant for constant features: the original order of
+        # elements in features[:n_known_constants] must be preserved for sibling
+        # and child nodes
+        memcpy(features, constant_features, sizeof(SIZE_t) * n_known_constants)
+
+        # Copy newly found constant features
+        memcpy(constant_features + n_known_constants,
+               features + n_known_constants,
+               sizeof(SIZE_t) * n_found_constants)
+
+        # Return values
+        split[0] = best
+        n_constant_features[0] = n_total_constants
+
+
 # Sort n-element arrays pointed to by Xf and samples, simultaneously,
 # by the values in Xf. Algorithm: Introsort (Musser, SP&E, 1997).
 cdef inline void sort(DTYPE_t* Xf, SIZE_t* samples, SIZE_t n) nogil:
@@ -869,7 +1224,8 @@ cdef class BaseSparseSplitter(Splitter):
                    object X,
                    np.ndarray[DOUBLE_t, ndim=2, mode="c"] y,
                    DOUBLE_t* sample_weight,
-                   np.ndarray X_idx_sorted=None) except *:
+                   np.ndarray X_idx_sorted=None,
+                   np.ndarray categorical_features=None) except *:
         """Initialize the splitter."""
 
         # Call parent init
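The categorical branch of `SmartSplitter.node_split` implements the classic reduction (Fisher's grouping result, used by Breiman et al. for regression and binary classification): order the categories by their mean response, then only the K-1 contiguous cut points in that order need to be scanned, instead of all 2^(K-1) subsets. A self-contained sketch of the search it performs, in plain Python with a hypothetical `best_categorical_split` helper:

```python
import numpy as np

def best_categorical_split(x_cat, y):
    """Best binary partition of categories by scanning mean-ordered cuts.

    Returns (right_bitmap, sse), where the bitmap follows the patch's
    convention: bit c == 1 sends category c to the right child. This
    sketches the strategy SmartSplitter uses, not the Cython code itself.
    """
    cats = np.unique(x_cat)
    order = sorted(cats, key=lambda c: y[x_cat == c].mean())
    best = (0, np.inf)
    for cut in range(1, len(order)):
        right = set(order[cut:])
        mask = np.isin(x_cat, list(right))
        sse = (((y[~mask] - y[~mask].mean()) ** 2).sum() +
               ((y[mask] - y[mask].mean()) ** 2).sum())
        if sse < best[1]:
            bitmap = 0
            for c in right:
                bitmap |= 1 << int(c)
            best = (bitmap, sse)
    return best

x = np.array([0, 0, 1, 1, 2, 2])
y = np.array([5.0, 6.0, 0.0, 1.0, 5.5, 6.5])
bitmap, _ = best_categorical_split(x, y)
assert bitmap == (1 << 0) | (1 << 2)  # categories 0 and 2 share high means
```

The Cython code gets the same effect by rewriting `Xf` to per-category means, reusing the ordinary sorted-threshold scan, and then materializing the winning cut as a bitmap.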
diff --git a/sklearn/tree/_tree.pxd b/sklearn/tree/_tree.pxd
index dbf0545b1e1d5..ac6e7bbce2050 100644
--- a/sklearn/tree/_tree.pxd
+++ b/sklearn/tree/_tree.pxd
@@ -18,13 +18,13 @@ ctypedef np.npy_float64 DOUBLE_t       # Type of y, sample_weight
 ctypedef np.npy_intp SIZE_t            # Type for indices and counters
 ctypedef np.npy_int32 INT32_t          # Signed 32 bit integer
 ctypedef np.npy_uint32 UINT32_t        # Unsigned 32 bit integer
+ctypedef np.npy_uint64 UINT64_t        # Unsigned 64 bit integer
 
 from ._splitter cimport Splitter
 from ._splitter cimport SplitRecord
 
 cdef struct Node:
     # Base storage structure for the nodes in a Tree object
-
     SIZE_t left_child                    # id of the left child of the node
     SIZE_t right_child                   # id of the right child of the node
     SIZE_t feature                       # Feature used for splitting the node
@@ -32,6 +32,8 @@ cdef struct Node:
     DOUBLE_t impurity                    # Impurity of the node (i.e., the value of the criterion)
     SIZE_t n_node_samples                # Number of samples at the node
     DOUBLE_t weighted_n_node_samples     # Weighted number of samples at the node
+    SIZE_t n_categories                  # Num. of categories of the feature; -1 if not categorical.
+    UINT64_t split_map                   # Bitmap guiding how to split; 1 means right node.
 
 
 cdef class Tree:
@@ -58,7 +60,9 @@ cdef class Tree:
     cdef SIZE_t _add_node(self, SIZE_t parent, bint is_left, bint is_leaf,
                           SIZE_t feature, double threshold, double impurity,
                           SIZE_t n_node_samples,
-                          double weighted_n_samples) nogil
+                          double weighted_n_samples,
+                          SIZE_t n_categories,
+                          UINT64_t split_map) nogil
     cdef void _resize(self, SIZE_t capacity) except *
     cdef int _resize_c(self, SIZE_t capacity=*) nogil
@@ -100,5 +104,6 @@ cdef class TreeBuilder:
     cpdef build(self, Tree tree, object X, np.ndarray y,
                 np.ndarray sample_weight=*,
-                np.ndarray X_idx_sorted=*)
+                np.ndarray X_idx_sorted=*,
+                np.ndarray categorical_features=*)
     cdef _check_input(self, object X, np.ndarray y, np.ndarray sample_weight)
diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx
index f8632ab1640d8..b4286aee641d8 100644
--- a/sklearn/tree/_tree.pyx
+++ b/sklearn/tree/_tree.pyx
@@ -31,6 +31,7 @@ from scipy.sparse import issparse
 from scipy.sparse import csc_matrix
 from scipy.sparse import csr_matrix
 
+from ._utils cimport goes_right
 from ._utils cimport Stack
 from ._utils cimport StackRecord
 from ._utils cimport PriorityHeap
@@ -65,12 +66,13 @@ cdef SIZE_t _TREE_LEAF = TREE_LEAF
 cdef SIZE_t _TREE_UNDEFINED = TREE_UNDEFINED
 cdef SIZE_t INITIAL_STACK_SIZE = 10
 
-# Repeat struct definition for numpy
+# Repeat struct definition of `Node` for numpy.
 NODE_DTYPE = np.dtype({
-    'names': ['left_child', 'right_child', 'feature', 'threshold', 'impurity',
-              'n_node_samples', 'weighted_n_node_samples'],
+    'names': ['left_child', 'right_child', 'feature', 'threshold',
+              'impurity', 'n_node_samples', 'weighted_n_node_samples',
+              'n_categories', 'split_map'],
     'formats': [np.intp, np.intp, np.intp, np.float64, np.float64, np.intp,
-                np.float64],
+                np.float64, np.intp, np.uint64],
     'offsets': [
         <Py_ssize_t> &(<Node*> NULL).left_child,
         <Py_ssize_t> &(<Node*> NULL).right_child,
@@ -78,10 +80,13 @@ NODE_DTYPE = np.dtype({
         <Py_ssize_t> &(<Node*> NULL).threshold,
         <Py_ssize_t> &(<Node*> NULL).impurity,
         <Py_ssize_t> &(<Node*> NULL).n_node_samples,
-        <Py_ssize_t> &(<Node*> NULL).weighted_n_node_samples
+        <Py_ssize_t> &(<Node*> NULL).weighted_n_node_samples,
+        <Py_ssize_t> &(<Node*> NULL).n_categories,
+        <Py_ssize_t> &(<Node*> NULL).split_map,
     ]
 })
 
+
 # =============================================================================
 # TreeBuilder
 # =============================================================================
@@ -91,7 +96,8 @@ cdef class TreeBuilder:
 
     cpdef build(self, Tree tree, object X, np.ndarray y,
                 np.ndarray sample_weight=None,
-                np.ndarray X_idx_sorted=None):
+                np.ndarray X_idx_sorted=None,
+                np.ndarray categorical_features=None):
         """Build a decision tree from the training set (X, y)."""
         pass
 
@@ -141,7 +147,8 @@ cdef class DepthFirstTreeBuilder(TreeBuilder):
 
     cpdef build(self, Tree tree, object X, np.ndarray y,
                 np.ndarray sample_weight=None,
-                np.ndarray X_idx_sorted=None):
+                np.ndarray X_idx_sorted=None,
+                np.ndarray categorical_features=None):
         """Build a decision tree from the training set (X, y)."""
 
         # check input
@@ -170,7 +177,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder):
         cdef double min_impurity_split = self.min_impurity_split
 
         # Recursive partition (without actual recursion)
-        splitter.init(X, y, sample_weight_ptr, X_idx_sorted)
+        splitter.init(X, y, sample_weight_ptr, X_idx_sorted, categorical_features)
 
         cdef SIZE_t start
         cdef SIZE_t end
@@ -234,7 +241,9 @@ cdef class DepthFirstTreeBuilder(TreeBuilder):
                     node_id = tree._add_node(parent, is_left, is_leaf, split.feature,
                                              split.threshold, impurity, n_node_samples,
-                                             weighted_n_node_samples)
+                                             weighted_n_node_samples,
+                                             split.n_categories,
+                                             split.split_map)
 
                     if node_id == <SIZE_t>(-1):
                         rc = -1
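Since the node record grows two fields, the parallel NumPy struct dtype gains `n_categories` (intp) and `split_map` (uint64). A sketch of an equivalent structured dtype, useful for eyeballing what a pickled tree's node array would contain under this patch; note the Cython code spells the offsets out from the C struct precisely because NumPy's default packing need not match C alignment:

```python
import numpy as np

# Mirror of the extended NODE_DTYPE from this patch (offsets computed
# by NumPy here, whereas the Cython code derives them from the struct).
node_dtype = np.dtype([
    ("left_child", np.intp),
    ("right_child", np.intp),
    ("feature", np.intp),
    ("threshold", np.float64),
    ("impurity", np.float64),
    ("n_node_samples", np.intp),
    ("weighted_n_node_samples", np.float64),
    ("n_categories", np.intp),   # negative sentinel for non-categorical nodes
    ("split_map", np.uint64),    # bit c == 1: category c goes right
])

nodes = np.zeros(3, dtype=node_dtype)
nodes[0]["n_categories"] = 4
nodes[0]["split_map"] = 0b1010   # categories 1 and 3 go right
print(nodes[0])
```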
@@ -302,7 +311,8 @@ cdef class BestFirstTreeBuilder(TreeBuilder):
 
     cpdef build(self, Tree tree, object X, np.ndarray y,
                 np.ndarray sample_weight=None,
-                np.ndarray X_idx_sorted=None):
+                np.ndarray X_idx_sorted=None,
+                np.ndarray categorical_features=None):
         """Build a decision tree from the training set (X, y)."""
 
         # check input
@@ -320,7 +330,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder):
         cdef SIZE_t min_samples_split = self.min_samples_split
 
         # Recursive partition (without actual recursion)
-        splitter.init(X, y, sample_weight_ptr, X_idx_sorted)
+        splitter.init(X, y, sample_weight_ptr, X_idx_sorted, categorical_features)
 
         cdef PriorityHeap frontier = PriorityHeap(INITIAL_STACK_SIZE)
         cdef PriorityHeapRecord record
@@ -362,6 +372,8 @@ cdef class BestFirstTreeBuilder(TreeBuilder):
                     node.right_child = _TREE_LEAF
                     node.feature = _TREE_UNDEFINED
                     node.threshold = _TREE_UNDEFINED
+                    node.n_categories = _TREE_UNDEFINED
+                    node.split_map = 0
 
                 else:
                     # Node is expandable
@@ -451,7 +463,9 @@ cdef class BestFirstTreeBuilder(TreeBuilder):
                                  else _TREE_UNDEFINED,
                                  is_left, is_leaf,
                                  split.feature, split.threshold, impurity, n_node_samples,
-                                 weighted_n_node_samples)
+                                 weighted_n_node_samples,
+                                 split.n_categories,
+                                 split.split_map)
         if node_id == <SIZE_t>(-1):
             return -1
@@ -577,10 +591,19 @@ cdef class Tree:
         def __get__(self):
             return self._get_node_ndarray()['weighted_n_node_samples'][:self.node_count]
 
+    property n_categories:
+        def __get__(self):
+            return self._get_node_ndarray()['n_categories'][:self.node_count]
+
+    property split_maps:
+        def __get__(self):
+            return self._get_node_ndarray()['split_map'][:self.node_count]
+
     property value:
         def __get__(self):
             return self._get_value_ndarray()[:self.node_count]
 
+
     def __cinit__(self, int n_features, np.ndarray[SIZE_t, ndim=1] n_classes,
                   int n_outputs):
         """Constructor."""
@@ -625,6 +648,7 @@ cdef class Tree:
         d["node_count"] = self.node_count
         d["nodes"] = self._get_node_ndarray()
         d["values"] = self._get_value_ndarray()
+
         return d
 
     def __setstate__(self, d):
@@ -641,6 +665,7 @@ cdef class Tree:
         value_shape = (node_ndarray.shape[0], self.n_outputs,
                        self.max_n_classes)
+
         if (node_ndarray.ndim != 1 or
                 node_ndarray.dtype != NODE_DTYPE or
                 not node_ndarray.flags.c_contiguous or
@@ -702,7 +727,8 @@ cdef class Tree:
 
     cdef SIZE_t _add_node(self, SIZE_t parent, bint is_left, bint is_leaf,
                           SIZE_t feature, double threshold, double impurity,
-                          SIZE_t n_node_samples, double weighted_n_node_samples) nogil:
+                          SIZE_t n_node_samples, double weighted_n_node_samples,
+                          SIZE_t n_categories, UINT64_t split_map) nogil:
         """Add a node to the tree.
 
         The new node registers itself as the child of its parent.
@@ -719,6 +745,8 @@ cdef class Tree:
         node.impurity = impurity
         node.n_node_samples = n_node_samples
         node.weighted_n_node_samples = weighted_n_node_samples
+        node.n_categories = n_categories
+        node.split_map = split_map
 
         if parent != _TREE_UNDEFINED:
             if is_left:
@@ -731,6 +759,8 @@ cdef class Tree:
             node.right_child = _TREE_LEAF
             node.feature = _TREE_UNDEFINED
             node.threshold = _TREE_UNDEFINED
+            node.n_categories = _TREE_UNDEFINED
+            node.split_map = 0
 
         else:
             # left_child and right_child will be set later
@@ -781,6 +811,8 @@ cdef class Tree:
         # Initialize auxiliary data-structure
         cdef Node* node = NULL
         cdef SIZE_t i = 0
+        cdef SIZE_t j
+        cdef DOUBLE_t sample_value
 
         with nogil:
             for i in range(n_samples):
@@ -788,11 +820,13 @@ cdef class Tree:
 
                 # While node not a leaf
                 while node.left_child != _TREE_LEAF:
                     # ... and node.right_child != _TREE_LEAF:
-                    if X_ptr[X_sample_stride * i +
-                             X_fx_stride * node.feature] <= node.threshold:
-                        node = &self.nodes[node.left_child]
-                    else:
+                    sample_value = X_ptr[X_sample_stride * i + X_fx_stride * node.feature]
+
+                    if goes_right(sample_value, node.threshold, node.n_categories,
+                                  node.split_map):
                         node = &self.nodes[node.right_child]
+                    else:
+                        node = &self.nodes[node.left_child]
 
                 out_ptr[i] = <SIZE_t>(node - self.nodes)  # node offset
diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd
index fce3abcb734db..6bbb3f150a675 100644
--- a/sklearn/tree/_utils.pxd
+++ b/sklearn/tree/_utils.pxd
@@ -17,6 +17,7 @@ ctypedef np.npy_float64 DOUBLE_t       # Type of y, sample_weight
 ctypedef np.npy_intp SIZE_t            # Type for indices and counters
 ctypedef np.npy_int32 INT32_t          # Signed 32 bit integer
 ctypedef np.npy_uint32 UINT32_t        # Unsigned 32 bit integer
+ctypedef np.npy_uint64 UINT64_t        # Unsigned 64 bit integer
 
 cdef enum:
     # Max value for our rand_r replacement (near the bottom).
@@ -48,15 +49,19 @@ cdef np.ndarray sizet_ptr_to_ndarray(SIZE_t* data, SIZE_t size)
 
 cdef SIZE_t rand_int(SIZE_t low, SIZE_t high,
-                     UINT32_t* random_state) nogil
+                    UINT32_t* random_state) nogil
 
 cdef double rand_uniform(double low, double high,
-                         UINT32_t* random_state) nogil
+                        UINT32_t* random_state) nogil
 
 cdef double log(double x) nogil
 
+
+cdef bint goes_right(DTYPE_t sample_value, double threshold,
+                     SIZE_t n_categories, UINT64_t split_map) nogil
+
+
 # =============================================================================
 # Stack data structure
 # =============================================================================
diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx
index a4ccc71946bd1..78be1ec245db3 100644
--- a/sklearn/tree/_utils.pyx
+++ b/sklearn/tree/_utils.pyx
@@ -85,6 +85,16 @@ cdef inline double log(double x) nogil:
     return ln(x) / ln(2.0)
 
 
+cdef inline bint goes_right(DTYPE_t sample_value, double threshold,
+                            SIZE_t n_categories, UINT64_t split_map) nogil:
+    """True if the sample value should go to the right node."""
+    if n_categories > 0:
+        # Categorical feature; 1 in the bitmap refers to the right node.
+        return (split_map >> (<SIZE_t> sample_value)) & 1
+    else:
+        # Continuous feature.
+        return sample_value > threshold
+
+
 # =============================================================================
 # Stack data structure
 # =============================================================================
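`goes_right` is the single routing predicate shared by training (node partitioning) and prediction (tree traversal): for categorical nodes it reads the sample's category bit out of the bitmap, otherwise it falls back to the usual threshold comparison. A direct pure-Python transliteration, for illustration only:

```python
def goes_right(sample_value, threshold, n_categories, split_map):
    """Pure-Python mirror of the Cython goes_right in _utils.pyx."""
    if n_categories > 0:
        # Categorical node: bit c == 1 in the map means "go right".
        return bool((split_map >> int(sample_value)) & 1)
    # Continuous node: ordinary threshold test.
    return sample_value > threshold

# Categorical node over 3 labels, with category 2 routed right:
assert goes_right(2.0, 0.0, 3, 0b100)
assert not goes_right(1.0, 0.0, 3, 0b100)
# Continuous node (n_categories stays at a negative sentinel):
assert goes_right(1.5, 1.0, -2, 0)
```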
diff --git a/sklearn/tree/export.py b/sklearn/tree/export.py
index 43e8aa11b9611..eb1d509d32f1e 100644
--- a/sklearn/tree/export.py
+++ b/sklearn/tree/export.py
@@ -71,7 +71,8 @@ def export_graphviz(decision_tree, out_file=SENTINEL, max_depth=None,
                     feature_names=None, class_names=None, label='all',
                     filled=False, leaves_parallel=False, impurity=True,
                     node_ids=False, proportion=False, rotate=False,
-                    rounded=False, special_characters=False):
+                    rounded=False, special_characters=False,
+                    split_map=True):
     """Export a decision tree in DOT format.
 
     This function generates a GraphViz representation of the decision tree,
@@ -212,6 +213,16 @@ def node_to_str(tree, node_id, criterion):
                 node_string += 'node '
             node_string += characters[0] + str(node_id) + characters[4]
 
+        # Write categorical split map
+        if split_map:
+            if labels:
+                if tree.n_categories[node_id] > 0:
+                    # Categorical feature
+                    _n_categories = tree.n_categories[node_id]
+                    _split_map = str(bin(tree.split_maps[node_id]))[2:]
+                    node_string += "0" * (_n_categories - len(_split_map)) + \
+                                   _split_map + characters[4]
+
         # Write decision criteria
         if tree.children_left[node_id] != _tree.TREE_LEAF:
             # Always write node decision criteria, except for leaves
@@ -221,10 +232,15 @@ def node_to_str(tree, node_id, criterion):
                 feature = "X%s%s%s" % (characters[1],
                                        tree.feature[node_id],
                                        characters[2])
-            node_string += '%s %s %s%s' % (feature,
-                                           characters[3],
-                                           round(tree.threshold[node_id], 4),
-                                           characters[4])
+
+            if tree.n_categories[node_id] > 0:
+                # Categorical feature
+                node_string += '%s%s' % (feature, characters[4])
+            else:
+                node_string += '%s %s %s%s' % (feature,
+                                               characters[3],
+                                               round(tree.threshold[node_id], 4),
+                                               characters[4])
 
         # Write impurity
         if impurity:
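`export_graphviz` now prints the bitmap zero-padded to the node's category count, most significant bit first (Python's `bin()` ordering), so bit 0 (category 0) is the last character. A tiny sketch of turning that string back into the two category sets (`decode_split_map` is an illustrative helper, not part of the patch):

```python
def decode_split_map(bits):
    """Split a zero-padded bitmap string from export_graphviz into
    (left_categories, right_categories).

    bin() prints the most significant bit first, so bits[-1] is category 0.
    """
    right = {i for i, b in enumerate(reversed(bits)) if b == "1"}
    left = set(range(len(bits))) - right
    return left, right

# A node over 4 categories where categories 1 and 3 go right is
# rendered as "1010":
assert decode_split_map("1010") == ({0, 2}, {1, 3})
```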
diff --git a/sklearn/tree/tree.py b/sklearn/tree/tree.py
index c3567e864c10b..faacbc797ff4a 100644
--- a/sklearn/tree/tree.py
+++ b/sklearn/tree/tree.py
@@ -61,11 +61,14 @@
                 "mae": _criterion.MAE}
 
 DENSE_SPLITTERS = {"best": _splitter.BestSplitter,
-                   "random": _splitter.RandomSplitter}
+                   "random": _splitter.RandomSplitter,
+                   "smart": _splitter.SmartSplitter}
 
 SPARSE_SPLITTERS = {"best": _splitter.BestSparseSplitter,
                    "random": _splitter.RandomSparseSplitter}
 
+MAX_CATEGORICAL_LABEL = 64
+
 # =============================================================================
 # Base decision tree
 # =============================================================================
@@ -115,7 +118,7 @@ def __init__(self,
         self.max_features_ = None
 
     def fit(self, X, y, sample_weight=None, check_input=True,
-            X_idx_sorted=None):
+            X_idx_sorted=None, categorical_features=None):
         random_state = check_random_state(self.random_state)
         if check_input:
@@ -310,6 +313,25 @@ def fit(self, X, y, sample_weight=None, check_input=True,
                                  ".shape = {})".format(X.shape,
                                                        X_idx_sorted.shape))
 
+        if categorical_features is not None:
+            if max(categorical_features) >= self.n_features_:
+                raise ValueError("Categorical feature indices are out of "
+                                 "range; there are only {} features.".format(
+                                     self.n_features_))
+
+            for f_idx in categorical_features:
+                uniq_values = set(map(int, X[:, f_idx]))
+
+                if max(uniq_values) + 1 > MAX_CATEGORICAL_LABEL:
+                    raise ValueError("The cardinality ({}) of feature {} is "
+                                     "too high (> {}).".format(
+                                         max(uniq_values) + 1, f_idx,
+                                         MAX_CATEGORICAL_LABEL))
+
+                if uniq_values != set(range(max(uniq_values) + 1)):
+                    raise ValueError("Categorical features should go through "
+                                     "LabelEncoder first.")
+
         # Build tree
         criterion = self.criterion
         if not isinstance(criterion, Criterion):
@@ -324,6 +346,8 @@ def fit(self, X, y, sample_weight=None, check_input=True,
 
         splitter = self.splitter
         if not isinstance(self.splitter, Splitter):
+            if self.splitter not in SPLITTERS:
+                raise ValueError("Unknown splitter '%s' for this type of "
+                                 "input." % self.splitter)
             splitter = SPLITTERS[self.splitter](criterion,
                                                 self.max_features_,
                                                 min_samples_leaf,
@@ -347,7 +371,10 @@ def fit(self, X, y, sample_weight=None, check_input=True,
                                            max_leaf_nodes,
                                            self.min_impurity_split)
 
-        builder.build(self.tree_, X, y, sample_weight, X_idx_sorted)
+        if categorical_features is not None:
+            categorical_features = np.asfortranarray(categorical_features)
+
+        builder.build(self.tree_, X, y, sample_weight, X_idx_sorted,
+                      categorical_features)
 
         if self.n_outputs_ == 1:
             self.n_classes_ = self.n_classes_[0]
@@ -696,7 +723,7 @@ def __init__(self,
             presort=presort)
 
     def fit(self, X, y, sample_weight=None, check_input=True,
-            X_idx_sorted=None):
+            X_idx_sorted=None, categorical_features=None):
         """Build a decision tree classifier from the training set (X, y).
 
         Parameters
@@ -726,6 +753,9 @@ def fit(self, X, y, sample_weight=None, check_input=True,
             cached between trees. If None, the data will be sorted here.
             Don't use this parameter unless you know what to do.
 
+        categorical_features : array, optional
+            A list of indices of categorical features.
+
         Returns
         -------
         self : object
@@ -736,7 +766,8 @@ def fit(self, X, y, sample_weight=None, check_input=True,
             X, y,
             sample_weight=sample_weight,
             check_input=check_input,
-            X_idx_sorted=X_idx_sorted)
+            X_idx_sorted=X_idx_sorted,
+            categorical_features=categorical_features)
         return self
 
@@ -834,8 +865,11 @@ class DecisionTreeRegressor(BaseDecisionTree, RegressorMixin):
 
     splitter : string, optional (default="best")
         The strategy used to choose the split at each node. Supported
-        strategies are "best" to choose the best split and "random" to choose
-        the best random split.
+        strategies are
+        - "best" to choose the best split;
+        - "random" to choose the best random split;
+        - "smart" to choose the best split with smart treatment of
+          categorical features based on local response proportion.
 
     max_features : int, float, string or None, optional (default=None)
         The number of features to consider when looking for the best split:
@@ -987,7 +1021,7 @@ def __init__(self,
             presort=presort)
 
     def fit(self, X, y, sample_weight=None, check_input=True,
-            X_idx_sorted=None):
+            X_idx_sorted=None, categorical_features=None):
         """Build a decision tree regressor from the training set (X, y).
 
         Parameters
@@ -1016,6 +1050,9 @@ def fit(self, X, y, sample_weight=None, check_input=True,
             cached between trees. If None, the data will be sorted here.
             Don't use this parameter unless you know what to do.
 
+        categorical_features : array, optional
+            A list of indices of categorical features.
+
         Returns
         -------
         self : object
@@ -1026,7 +1063,8 @@ def fit(self, X, y, sample_weight=None, check_input=True,
             X, y,
             sample_weight=sample_weight,
             check_input=check_input,
-            X_idx_sorted=X_idx_sorted)
+            X_idx_sorted=X_idx_sorted,
+            categorical_features=categorical_features)
         return self
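Putting it together, usage under this patch would look like the following: the `"smart"` splitter is registered for dense input, and categorical columns must be label-encoded to contiguous integers in 0..63 first, as the new validation in `fit` enforces. An untested sketch:

```python
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeRegressor

rng = np.random.RandomState(0)
color = rng.choice(["red", "green", "blue"], size=200)   # categorical column
x1 = rng.rand(200)                                       # continuous column
y = (color == "red") * 2.0 + x1 + rng.normal(scale=0.1, size=200)

# The patch requires categorical columns to be dense integer labels.
color_enc = LabelEncoder().fit_transform(color)
X = np.column_stack([color_enc, x1]).astype(np.float32)

tree = DecisionTreeRegressor(splitter="smart", random_state=0)
tree.fit(X, y, categorical_features=[0])  # column 0 is categorical

print(tree.tree_.n_categories[:5])  # per-node cardinality (negative if not categorical)
print(tree.tree_.split_maps[:5])    # per-node right-child bitmaps
```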