27
27
from .base import BaseEnsemble
28
28
from ..base import ClassifierMixin , RegressorMixin
29
29
from ..tree import DecisionTreeClassifier , DecisionTreeRegressor
30
+ from ..tree .tree import BaseDecisionTree
30
31
from ..utils import check_arrays , check_random_state
31
32
from ..metrics import accuracy_score , r2_score
32
33
@@ -108,12 +109,20 @@ def fit(self, X, y, sample_weight=None):
108
109
self .estimator_weights_ = np .zeros (self .n_estimators , dtype = np .float )
109
110
self .estimator_errors_ = np .ones (self .n_estimators , dtype = np .float )
110
111
112
+ # Create argsorted X for fast tree induction
113
+ X_argsorted = None
114
+
115
+ if isinstance (self .base_estimator , BaseDecisionTree ):
116
+ X_argsorted = np .asfortranarray (
117
+ np .argsort (X .T , axis = 1 ).astype (np .int32 ).T )
118
+
111
119
for iboost in xrange (self .n_estimators ):
112
120
# Boosting step
113
121
sample_weight , estimator_weight , estimator_error = self ._boost (
114
122
iboost ,
115
123
X , y ,
116
- sample_weight )
124
+ sample_weight ,
125
+ X_argsorted = X_argsorted )
117
126
118
127
# Early termination
119
128
if sample_weight is None :
@@ -139,7 +148,7 @@ def fit(self, X, y, sample_weight=None):
139
148
return self
140
149
141
150
@abstractmethod
142
- def _boost (self , iboost , X , y , sample_weight ):
151
+ def _boost (self , iboost , X , y , sample_weight , X_argsorted = None ):
143
152
"""Implement a single boost.
144
153
145
154
Warning: This method needs to be overridden by subclasses.
@@ -158,6 +167,14 @@ def _boost(self, iboost, X, y, sample_weight):
158
167
sample_weight : array-like of shape = [n_samples]
159
168
The current sample weights.
160
169
170
+ X_argsorted : array-like, shape = [n_samples, n_features] (optional)
171
+ Each column of ``X_argsorted`` holds the row indices of ``X``
172
+ sorted according to the value of the corresponding feature
173
+ in ascending order.
174
+ The argument is supported to enable multiple decision trees
175
+ to share the data structure and to avoid re-computation in
176
+ tree ensembles. For maximum efficiency use dtype np.int32.
177
+
161
178
Returns
162
179
-------
163
180
sample_weight : array-like of shape = [n_samples] or None
@@ -367,7 +384,7 @@ def fit(self, X, y, sample_weight=None):
367
384
368
385
return super (AdaBoostClassifier , self ).fit (X , y , sample_weight )
369
386
370
- def _boost (self , iboost , X , y , sample_weight ):
387
+ def _boost (self , iboost , X , y , sample_weight , X_argsorted = None ):
371
388
"""Implement a single boost.
372
389
373
390
Perform a single boost according to the real multi-class SAMME.R
@@ -388,6 +405,14 @@ def _boost(self, iboost, X, y, sample_weight):
388
405
sample_weight : array-like of shape = [n_samples]
389
406
The current sample weights.
390
407
408
+ X_argsorted : array-like, shape = [n_samples, n_features] (optional)
409
+ Each column of ``X_argsorted`` holds the row indices of ``X``
410
+ sorted according to the value of the corresponding feature
411
+ in ascending order.
412
+ The argument is supported to enable multiple decision trees
413
+ to share the data structure and to avoid re-computation in
414
+ tree ensembles. For maximum efficiency use dtype np.int32.
415
+
391
416
Returns
392
417
-------
393
418
sample_weight : array-like of shape = [n_samples] or None
@@ -403,17 +428,24 @@ def _boost(self, iboost, X, y, sample_weight):
403
428
If None then boosting has terminated early.
404
429
"""
405
430
if self .algorithm == 'SAMME.R' :
406
- return self ._boost_real (iboost , X , y , sample_weight )
431
+ return self ._boost_real (iboost , X , y , sample_weight ,
432
+ X_argsorted = X_argsorted )
407
433
408
434
else : # elif self.algorithm == "SAMME":
409
- return self ._boost_discrete (iboost , X , y , sample_weight )
435
+ return self ._boost_discrete (iboost , X , y , sample_weight ,
436
+ X_argsorted = X_argsorted )
410
437
411
- def _boost_real (self , iboost , X , y , sample_weight ):
438
+ def _boost_real (self , iboost , X , y , sample_weight , X_argsorted = None ):
412
439
"""Implement a single boost using the SAMME.R real algorithm."""
413
440
estimator = self ._make_estimator ()
414
441
415
- y_predict_proba = estimator .fit (
416
- X , y , sample_weight = sample_weight ).predict_proba (X )
442
+ if X_argsorted is not None :
443
+ estimator .fit (X , y , sample_weight = sample_weight ,
444
+ X_argsorted = X_argsorted )
445
+ else :
446
+ estimator .fit (X , y , sample_weight = sample_weight )
447
+
448
+ y_predict_proba = estimator .predict_proba (X )
417
449
418
450
if iboost == 0 :
419
451
self .classes_ = getattr (estimator , 'classes_' , None )
@@ -464,12 +496,17 @@ def _boost_real(self, iboost, X, y, sample_weight):
464
496
465
497
return sample_weight , 1. , estimator_error
466
498
467
- def _boost_discrete (self , iboost , X , y , sample_weight ):
499
+ def _boost_discrete (self , iboost , X , y , sample_weight , X_argsorted = None ):
468
500
"""Implement a single boost using the SAMME discrete algorithm."""
469
501
estimator = self ._make_estimator ()
470
502
471
- y_predict = estimator .fit (
472
- X , y , sample_weight = sample_weight ).predict (X )
503
+ if X_argsorted is not None :
504
+ estimator .fit (X , y , sample_weight = sample_weight ,
505
+ X_argsorted = X_argsorted )
506
+ else :
507
+ estimator .fit (X , y , sample_weight = sample_weight )
508
+
509
+ y_predict = estimator .predict (X )
473
510
474
511
if iboost == 0 :
475
512
self .classes_ = getattr (estimator , 'classes_' , None )
@@ -875,7 +912,7 @@ def fit(self, X, y, sample_weight=None):
875
912
# Fit
876
913
return super (AdaBoostRegressor , self ).fit (X , y , sample_weight )
877
914
878
- def _boost (self , iboost , X , y , sample_weight ):
915
+ def _boost (self , iboost , X , y , sample_weight , X_argsorted = None ):
879
916
"""Implement a single boost for regression
880
917
881
918
Perform a single boost according to the AdaBoost.R2 algorithm and
@@ -896,6 +933,14 @@ def _boost(self, iboost, X, y, sample_weight):
896
933
sample_weight : array-like of shape = [n_samples]
897
934
The current sample weights.
898
935
936
+ X_argsorted : array-like, shape = [n_samples, n_features] (optional)
937
+ Each column of ``X_argsorted`` holds the row indices of ``X``
938
+ sorted according to the value of the corresponding feature
939
+ in ascending order.
940
+ The argument is supported to enable multiple decision trees
941
+ to share the data structure and to avoid re-computation in
942
+ tree ensembles. For maximum efficiency use dtype np.int32.
943
+
899
944
Returns
900
945
-------
901
946
sample_weight : array-like of shape = [n_samples] or None
@@ -925,8 +970,9 @@ def _boost(self, iboost, X, y, sample_weight):
925
970
926
971
# Fit on the bootstrapped sample and obtain a prediction
927
972
# for all samples in the training set
928
- y_predict = estimator .fit (
929
- X [bootstrap_idx ], y [bootstrap_idx ]).predict (X )
973
+ # X_argsorted is not used since bootstrap copies are used.
974
+ estimator .fit (X [bootstrap_idx ], y [bootstrap_idx ])
975
+ y_predict = estimator .predict (X )
930
976
931
977
error_vect = np .abs (y_predict - y )
932
978
error_max = error_vect .max ()
@@ -965,7 +1011,6 @@ def _boost(self, iboost, X, y, sample_weight):
965
1011
return sample_weight , estimator_weight , estimator_error
966
1012
967
1013
def _get_median_predict (self , X , limit = - 1 ):
968
-
969
1014
if not self .estimators_ :
970
1015
raise RuntimeError (
971
1016
("{0} is not initialized. "
0 commit comments