* Created new methods for decision path computation instead of overlo… · scikit-learn/scikit-learn@068f05b · GitHub
[go: up one dir, main page]

Skip to content

Commit 068f05b

Browse files
committed
* Created new methods for decision path computation instead of overloading
the predict method * Fixed a bug of decision path array being typed int32 instead of SIZE_t
1 parent 94c2f87 commit 068f05b

File tree

6 files changed

+352
-426
lines changed

6 files changed

+352
-426
lines changed

sklearn/ensemble/forest.py

Lines changed: 47 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -136,8 +136,8 @@ def _parallel_predict_proba(trees, X, n_classes, n_outputs):
136136

137137
def _parallel_predict_paths(trees, X):
138138
"""Private function used to compute a batch of prediction paths within a job."""
139-
return [tree.predict(X, return_paths = True) for tree in trees]
140-
139+
return [tree.decision_paths(X) for tree in trees]
140+
141141

142142
def _parallel_predict_regression(trees, X):
143143
"""Private function used to compute a batch of predictions within a job."""
@@ -303,6 +303,40 @@ def _validate_y(self, y):
303303
# Default implementation
304304
return y
305305

306+
def decision_paths(self, X):
307+
"""Return the decision paths leading to the prediction for X, from every tree.
308+
309+
310+
Parameters
311+
----------
312+
X : array-like of shape = [n_samples, n_features]
313+
The input samples.
314+
315+
Returns
316+
-------
317+
y : list of n_estimators arrays, each of shape = [n_samples, max_depth + 1]
318+
Decision paths for each tree and for each prediction.
319+
Each path is an array of node ids, starting with the root node id.
320+
If a path is shorter than max_depth + 1, it is padded with -1 on the right.
321+
"""
322+
323+
# Check data
324+
if getattr(X, "dtype", None) != DTYPE or X.ndim != 2:
325+
X = array2d(X, dtype=DTYPE)
326+
327+
# Assign chunk of trees to jobs
328+
n_jobs, n_trees, starts = _partition_estimators(self)
329+
330+
# Parallel loop
331+
path_list = Parallel(n_jobs=n_jobs, verbose=self.verbose,
332+
backend="threading")(
333+
delayed(_parallel_predict_paths)(
334+
self.estimators_[starts[i]:starts[i + 1]], X)
335+
for i in range(n_jobs))
336+
# Unpack the nested per-job lists into one flat list (one entry per tree) and return
337+
return [lst for med_lst in path_list for lst in med_lst]
338+
339+
306340
@property
307341
def feature_importances_(self):
308342
"""Return the feature importances (the higher, the more important the
@@ -404,7 +438,7 @@ def _validate_y(self, y):
404438

405439
return y
406440

407-
def predict(self, X, return_paths = True):
441+
def predict(self, X):
408442
"""Predict class for X.
409443
410444
The predicted class of an input sample is computed as the majority
@@ -420,25 +454,6 @@ def predict(self, X, return_paths = True):
420454
y : array of shape = [n_samples] or [n_samples, n_outputs]
421455
The predicted classes.
422456
"""
423-
424-
425-
if return_paths:
426-
# Check data
427-
if getattr(X, "dtype", None) != DTYPE or X.ndim != 2:
428-
X = array2d(X, dtype=DTYPE)
429-
430-
# Assign chunk of trees to jobs
431-
n_jobs, n_trees, starts = _partition_estimators(self)
432-
433-
# Parallel loop
434-
path_list = Parallel(n_jobs=n_jobs, verbose=self.verbose,
435-
backend="threading")(
436-
delayed(_parallel_predict_paths)(
437-
self.estimators_[starts[i]:starts[i + 1]], X)
438-
for i in range(n_jobs))
439-
#unpack the nested list and return
440-
return [lst for med_lst in path_list for lst in med_lst]
441-
442457
n_samples = len(X)
443458
proba = self.predict_proba(X)
444459

@@ -567,7 +582,7 @@ def __init__(self,
567582
random_state=random_state,
568583
verbose=verbose)
569584

570-
def predict(self, X, return_paths = False):
585+
def predict(self, X):
571586
"""Predict regression target for X.
572587
573588
The predicted regression target of an input sample is computed as the
@@ -591,27 +606,16 @@ def predict(self, X, return_paths = False):
591606
n_jobs, n_trees, starts = _partition_estimators(self)
592607

593608
# Parallel loop
594-
if return_paths:
595-
path_list = Parallel(n_jobs=n_jobs, verbose=self.verbose,
596-
backend="threading")(
597-
delayed(_parallel_predict_paths)(
598-
self.estimators_[starts[i]:starts[i + 1]], X)
599-
for i in range(n_jobs))
600-
#unpack the nested list and return
601-
return [lst for med_lst in path_list for lst in med_lst]
602-
else:
603-
all_y_hat = Parallel(n_jobs=n_jobs, verbose=self.verbose,
604-
backend="threading")(
605-
delayed(_parallel_predict_regression)(
606-
self.estimators_[starts[i]:starts[i + 1]], X)
607-
for i in range(n_jobs))
608-
# Reduce
609-
y_hat = sum(all_y_hat) / len(self.estimators_)
610-
611-
return y_hat
609+
all_y_hat = Parallel(n_jobs=n_jobs, verbose=self.verbose,
610+
backend="threading")(
611+
delayed(_parallel_predict_regression)(
612+
self.estimators_[starts[i]:starts[i + 1]], X)
613+
for i in range(n_jobs))
612614

613-
614-
615+
# Reduce
616+
y_hat = sum(all_y_hat) / len(self.estimators_)
617+
618+
return y_hat
615619

616620
def _set_oob_score(self, X, y):
617621
n_samples = y.shape[0]

0 commit comments

Comments
 (0)
0