rename node.sample_mask to node.terminal_region · scikit-learn/scikit-learn@d63f39c

Commit d63f39c
rename node.sample_mask to node.terminal_region
renamed BernoulliLoss to BinomialLoss and rm old BinomialLoss
1 parent 5fe9b26 commit d63f39c
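A FIXME in the old code spells out the motivation for the rename: despite its name, `sample_mask` actually held an index array of the samples falling into a leaf, not a boolean mask. A minimal sketch of the distinction (hypothetical arrays, not code from the commit):

```python
import numpy as np

y = np.array([3.0, 1.0, 4.0, 1.0, 5.0])

# A boolean mask flags every sample as in or out of the region ...
sample_mask = np.array([True, False, True, False, True])
print(y[sample_mask])                   # [3. 4. 5.]

# ... whereas the attribute really stored the *indices* of the samples
# in a terminal region, consumed via np.take as in the diff below.
terminal_region = np.array([0, 2, 4])
print(y.take(terminal_region, axis=0))  # [3. 4. 5.]
```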

File tree: 5 files changed (+540, -582 lines)

benchmarks/bench_gbrt.py (15 additions, 12 deletions)

```diff
@@ -19,12 +19,13 @@ def wrapper(*args, **kargs):
     return wrapper
 
 
-np.seterr(invalid='print', under='print', divide='print', over='print')
+# ignore overflows due to exp
+np.seterr(invalid='print', under='print', divide='print', over='ignore')
 
 
-classification_params = {'loss': 'bernoulli', 'n_iter': 100,
+classification_params = {'loss': 'deviance', 'n_iter': 100,
                          'min_split': 1, 'max_depth': 1,
-                         'learn_rate': 1.0, 'subsample': 1.0}
+                         'learn_rate': .6, 'subsample': 1.0}
 
 
 @repeat
@@ -69,7 +70,7 @@ def random_gaussian_learning_curve(random_state=None):
     y_train, y_test = y[:2000], y[2000:]
 
     n_iter = 2000
-    max_depth = 1
+    max_depth = 2
 
     deviance = np.zeros((n_iter,), dtype=np.float64)
     error_rate = np.zeros((n_iter,), dtype=np.float64)
@@ -84,18 +85,20 @@ def monitor(clf, i):
         tmp = ((1.0 / (1.0 + np.exp(-y_pred))) >= 0.5).astype(np.float64)
         error_rate[i] = np.mean(tmp != y_test)
 
-    gbrt = GradientBoostingClassifier(loss='bernoulli', n_iter=n_iter, min_split=1,
+    print "Train first model..."
+    gbrt = GradientBoostingClassifier(loss='deviance', n_iter=n_iter, min_split=1,
                                       max_depth=max_depth, learn_rate=1.0,
                                       subsample=1.0)
     gbrt.fit(X_train, y_train, monitor=monitor)
     n = deviance.shape[0]
-    print deviance[:5]
+
     pl.subplot(122)
     pl.plot(np.arange(n), error_rate, "r-", label="No shrinkage")
     pl.subplot(121)
     pl.plot(np.arange(n), deviance, "r-", label="No shrinkage")
 
-    gbrt = GradientBoostingClassifier(loss='bernoulli', n_iter=n_iter, min_split=1,
+    print "Train second model..."
+    gbrt = GradientBoostingClassifier(loss='deviance', n_iter=n_iter, min_split=1,
                                       max_depth=max_depth, learn_rate=0.2,
                                       subsample=1.0)
     gbrt.fit(X_train, y_train, monitor=monitor)
@@ -230,14 +233,14 @@ def bench_friedman3(random_state=None):
 
 if __name__ == "__main__":
 
-    print "spam", bench_spam()
+    print "spam", bench_spam()
 
     ## print "Example 10.2 - LC"
-    ## random_gaussian_learning_curve()
-    print "Example 10.2", bench_random_gaussian()
+    ## random_gaussian_learning_curve(13)
+    ## print "Example 10.2", bench_random_gaussian()
 
-    print "Madelon", bench_madelon()
-    print "Arcene", bench_arcene()
+    ## print "Madelon", bench_madelon()
+    ## print "Arcene", bench_arcene()
 
     ## print "Boston", bench_boston()
     ## print "Friedman#1", bench_friedman1()
```

sklearn/ensemble/gradient_boosting.py (21 additions, 57 deletions)

```diff
@@ -76,22 +76,6 @@ def predict(self, X):
         return y
 
 
-class ClassPrior2Predictor(object):
-    """A simple initial estimator that predicts the mean
-    of the training targets.
-    """
-
-    prior = None
-
-    def fit(self, X, y):
-        self.prior = np.log(y.sum() / float(y.shape[0] - y.sum()))
-
-    def predict(self, X):
-        y = np.empty((X.shape[0],), dtype=np.float64)
-        y.fill(self.prior)
-        return y
-
-
 class ClassPriorPredictor(object):
     """A simple initial estimator that predicts the mean
     of the training targets.
@@ -100,8 +84,7 @@ class ClassPriorPredictor(object):
     prior = None
 
     def fit(self, X, y):
-        pos_prior = y[y == -1].shape[0] / float(y.shape[0])
-        self.prior = 0.5 * np.log2(pos_prior / (1.0 - pos_prior))
+        self.prior = np.log(y.sum() / float(y.shape[0] - y.sum()))
 
     def predict(self, X):
         y = np.empty((X.shape[0],), dtype=np.float64)
```
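For context (not part of the diff): with 0/1 labels, the surviving `fit` sets the prior to the log-odds of the positive class, log(n_pos / n_neg), replacing the removed base-2 variant computed from the negative class. A quick check with made-up labels:

```python
import numpy as np

# Hypothetical 0/1 labels; y.sum() counts the positives.
y = np.array([1.0, 1.0, 1.0, 0.0])
prior = np.log(y.sum() / float(y.shape[0] - y.sum()))
print(prior)  # log(3 / 1) is about 1.0986: the log-odds of the positive class
```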
```diff
@@ -170,8 +153,8 @@ def negative_gradient(self, y, pred):
 
     def _update_terminal_region(self, node, X, y, residual, pred):
         """LAD updates terminal regions to median estimates. """
-        node.value = np.asanyarray(np.median(y.take(node.sample_mask, axis=0) - \
-                                             pred.take(node.sample_mask, axis=0)))
+        node.value = np.asanyarray(np.median(y.take(node.terminal_region, axis=0) - \
+                                             pred.take(node.terminal_region, axis=0)))
 
 
 ## class HuberError(LossFunction):
@@ -189,14 +172,14 @@ def _update_terminal_region(self, node, X, y, residual, pred):
 ##     def _update_terminal_region(self, node, X, y, residual, pred):
 ##         """LAD updates terminal regions to median estimates. """
 ##         ## FIXME copied from LAD, still TODO
-##         node.value = np.asanyarray(np.median(y.take(node.sample_mask, axis=0) - \
-##                                              pred.take(node.sample_mask, axis=0)))
+##         node.value = np.asanyarray(np.median(y.take(node.terminal_region, axis=0) - \
+##                                              pred.take(node.terminal_region, axis=0)))
 
 
-class BernoulliDeviance(LossFunction):
+class BinomialDeviance(LossFunction):
 
     def init_estimator(self):
-        return ClassPrior2Predictor()
+        return ClassPriorPredictor()
 
     def __call__(self, y, pred):
         """Compute the deviance (= negative log-likelihood). """
```
```diff
@@ -207,45 +190,26 @@ def negative_gradient(self, y, pred):
 
     def _update_terminal_region(self, node, X, y, residual, pred):
         """Make a single Newton-Raphson step. """
-        residual = residual.take(node.sample_mask, axis=0)
-        y = y.take(node.sample_mask, axis=0)
-
-        node.value = np.asanyarray(residual.sum() / \
-                                   np.sum((y - residual) * (1.0 - y + residual)),
-                                   dtype=np.float64)
 
-        # FIXME free mem - rename `sample_mask` since its actually an index arr
-        del node.sample_mask
-        node.sample_mask = None
-
-
-class BinomialDeviance(LossFunction):
-
-    def init_estimator(self):
-        return ClassPriorPredictor()
-
-    def __call__(self, y, pred):
-        return np.log2(1.0 + np.exp(-2.0 * y * pred))
+        residual = residual.take(node.terminal_region, axis=0)
+        y = y.take(node.terminal_region, axis=0)
 
-    def negative_gradient(self, y, pred):
-        return (2.0 * y) / (1.0 + np.exp(2.0 * y * pred))
+        numerator = residual.sum()
+        denominator = np.sum((y - residual) * (1.0 - y + residual))
 
-    def _update_terminal_region(self, node, X, y, residual, pred):
-        """Make a single Newton-Raphson step. """
-        targets = residual.take(node.sample_mask, axis=0)
-        abs_targets = np.abs(targets)
-        node.value = np.asanyarray(targets.sum() / np.sum(abs_targets * \
-                                                          (2.00000001 - abs_targets)))
+        if denominator == 0.0:
+            node.value = np.array(0.0, dtype=np.float64)
+        else:
+            node.value = np.asanyarray(numerator / denominator, dtype=np.float64)
 
         # FIXME free mem - rename `sample_mask` since its actually an index arr
-        del node.sample_mask
-        node.sample_mask = None
+        del node.terminal_region
+        node.terminal_region = None
 
 
 LOSS_FUNCTIONS = {'ls': LeastSquaresError,
                   'lad': LeastAbsoluteError,
-                  'deviance': BinomialDeviance,
-                  'bernoulli': BernoulliDeviance}
+                  'deviance': BinomialDeviance}
 
 
 class BaseGradientBoosting(BaseEstimator):
```
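The consolidated `_update_terminal_region` keeps the one-step Newton-Raphson leaf update and adds a guard for a zero denominator. Assuming 0/1 labels and residual r = y - p (with p the predicted probability), the denominator (y - r) * (1 - y + r) simplifies to p * (1 - p), the second derivative of the deviance, which is what makes this a Newton step. A standalone numeric check with made-up values:

```python
import numpy as np

# Made-up leaf data: 0/1 labels y and predicted probabilities p.
y = np.array([1.0, 0.0, 1.0, 1.0])
p = np.array([0.9, 0.2, 0.6, 0.5])
residual = y - p                 # negative gradient of the binomial deviance

numerator = residual.sum()
denominator = np.sum((y - residual) * (1.0 - y + residual))

# (y - r) == p and (1 - y + r) == 1 - p, so the denominator is sum(p * (1 - p)).
assert np.allclose(denominator, np.sum(p * (1.0 - p)))
print(numerator / denominator)   # the leaf value (the diff guards denominator == 0)
```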
```diff
@@ -357,14 +321,14 @@ def fit(self, X, y, monitor=None):
             #print "Iteration %d - build_tree - in %fs" % (i, time() - t0)
 
 
-            #assert tree.is_leaf == False
-
+            assert tree.is_leaf != True
+
             loss.update_terminal_regions(tree, X, y, residual, y_pred)
             #print "Iteration %d - update - in %fs" % (i, time() - t0)
             self.trees.append(tree)
 
             y_pred = self._predict(X, old_pred=y_pred)
-
+
             if monitor:
                 monitor(self, i)
```
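With the 'bernoulli' entry removed from LOSS_FUNCTIONS, 'deviance' is the only key selecting the binomial loss. A usage sketch against this development snapshot, mirroring the constructor call in the benchmark above; the import path and the availability of `predict` are assumptions, and these parameter names (`n_iter`, `min_split`, `learn_rate`) predate the modern scikit-learn API:

```python
import numpy as np
from sklearn.ensemble.gradient_boosting import GradientBoostingClassifier  # assumed path

# Toy 0/1 classification problem.
rng = np.random.RandomState(13)
X = rng.randn(200, 4)
y = (X[:, 0] + X[:, 1] > 0.0).astype(np.float64)

# loss='deviance' now maps to BinomialDeviance; loss='bernoulli' is no longer accepted.
clf = GradientBoostingClassifier(loss='deviance', n_iter=100, min_split=1,
                                 max_depth=1, learn_rate=0.6, subsample=1.0)
clf.fit(X, y)
print(np.mean(clf.predict(X) == y))  # training accuracy (predict assumed available)
```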
