[MRG+2] Adding return_std options for models in linear_model/bayes.py… · NelleV/scikit-learn@67a85b8 · GitHub


Commit 67a85b8

sergeyf authored and amueller committed
[MRG+2] Adding return_std options for models in linear_model/bayes.py (scikit-learn#7838)
[MRG+2] Adding return_std options for models in linear_model/bayes.py (scikit-learn#7838)

* initial commit for return_std
* initial commit for return_std
* adding tests, examples, ARD predict_std
* adding tests, examples, ARD predict_std
* a smidge more documentation
* a smidge more documentation
* Missed a few PEP8 issues
* Changing predict_std to return_std #1
* Changing predict_std to return_std #2
* Changing predict_std to return_std #3
* Changing predict_std to return_std final
* adding better plots via polynomial regression
* trying to fix flake error
* fix to ARD plotting issue
* fixing some flakes
* Two blank lines part 1
* Two blank lines part 2
* More newlines!
* Even more newlines
* adding info to the doc string for the two plot files
* Rephrasing "polynomial" for Bayesian Ridge Regression
* Updating "polynomia" for ARD
* Adding more formal references
* Another asked-for improvement to doc string.
* Fixing flake8 errors
* Cleaning up the tests a smidge.
* A few more flakes
* requested fixes from Andy
* Mini bug fix
* Final pep8 fix
* pep8 fix round 2
* Fix beta_ to alpha_ in the comments
1 parent c5c3851 commit 67a85b8

File tree: 4 files changed, +203 −9 lines changed

examples/linear_model/plot_ard.py

Lines changed: 34 additions & 2 deletions

@@ -15,6 +15,12 @@

 The estimation of the model is done by iteratively maximizing the
 marginal log-likelihood of the observations.
+
+We also plot predictions and uncertainties for ARD
+for one dimensional regression using polynomial feature expansion.
+Note the uncertainty starts going up on the right side of the plot.
+This is because these test samples are outside of the range of the training
+samples.
 """
 print(__doc__)

@@ -54,8 +60,8 @@
 ols.fit(X, y)

 ###############################################################################
-# Plot the true weights, the estimated weights and the histogram of the
-# weights
+# Plot the true weights, the estimated weights, the histogram of the
+# weights, and predictions with standard deviations
 plt.figure(figsize=(6, 5))
 plt.title("Weights of the model")
 plt.plot(clf.coef_, color='darkblue', linestyle='-', linewidth=2,

@@ -81,4 +87,30 @@
 plt.plot(clf.scores_, color='navy', linewidth=2)
 plt.ylabel("Score")
 plt.xlabel("Iterations")
+
+
+# Plotting some predictions for polynomial regression
+def f(x, noise_amount):
+    y = np.sqrt(x) * np.sin(x)
+    noise = np.random.normal(0, 1, len(x))
+    return y + noise_amount * noise
+
+
+degree = 10
+X = np.linspace(0, 10, 100)
+y = f(X, noise_amount=1)
+clf_poly = ARDRegression(threshold_lambda=1e5)
+clf_poly.fit(np.vander(X, degree), y)
+
+X_plot = np.linspace(0, 11, 25)
+y_plot = f(X_plot, noise_amount=0)
+y_mean, y_std = clf_poly.predict(np.vander(X_plot, degree), return_std=True)
+plt.figure(figsize=(6, 5))
+plt.errorbar(X_plot, y_mean, y_std, color='navy',
+             label="Polynomial ARD", linewidth=2)
+plt.plot(X_plot, y_plot, color='gold', linewidth=2,
+         label="Ground Truth")
+plt.ylabel("Output y")
+plt.xlabel("Feature X")
+plt.legend(loc="lower left")
 plt.show()
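For readers less familiar with the example, the one dimensional input is turned into an ordinary linear regression problem through a Vandermonde design matrix, so the polynomial fit above is just ARD on expanded features. A minimal sketch of that expansion, with illustrative variable names that are not part of the commit:

import numpy as np

x = np.linspace(0, 10, 5)      # a handful of 1-D inputs
design = np.vander(x, 10)      # columns are x**9, x**8, ..., x**1, x**0
print(design.shape)            # (5, 10): one row per sample, one column per power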

examples/linear_model/plot_bayesian_ridge.py

Lines changed: 34 additions & 1 deletion

@@ -15,6 +15,12 @@

 The estimation of the model is done by iteratively maximizing the
 marginal log-likelihood of the observations.
+
+We also plot predictions and uncertainties for Bayesian Ridge Regression
+for one dimensional regression using polynomial feature expansion.
+Note the uncertainty starts going up on the right side of the plot.
+This is because these test samples are outside of the range of the training
+samples.
 """
 print(__doc__)

@@ -51,7 +57,8 @@
 ols.fit(X, y)

 ###############################################################################
-# Plot true weights, estimated weights and histogram of the weights
+# Plot true weights, estimated weights, histogram of the weights, and
+# predictions with standard deviations
 lw = 2
 plt.figure(figsize=(6, 5))
 plt.title("Weights of the model")

@@ -77,4 +84,30 @@
 plt.plot(clf.scores_, color='navy', linewidth=lw)
 plt.ylabel("Score")
 plt.xlabel("Iterations")
+
+
+# Plotting some predictions for polynomial regression
+def f(x, noise_amount):
+    y = np.sqrt(x) * np.sin(x)
+    noise = np.random.normal(0, 1, len(x))
+    return y + noise_amount * noise
+
+
+degree = 10
+X = np.linspace(0, 10, 100)
+y = f(X, noise_amount=0.1)
+clf_poly = BayesianRidge()
+clf_poly.fit(np.vander(X, degree), y)
+
+X_plot = np.linspace(0, 11, 25)
+y_plot = f(X_plot, noise_amount=0)
+y_mean, y_std = clf_poly.predict(np.vander(X_plot, degree), return_std=True)
+plt.figure(figsize=(6, 5))
+plt.errorbar(X_plot, y_mean, y_std, color='navy',
+             label="Polynomial Bayesian Ridge Regression", linewidth=lw)
+plt.plot(X_plot, y_plot, color='gold', linewidth=lw,
+         label="Ground Truth")
+plt.ylabel("Output y")
+plt.xlabel("Feature X")
+plt.legend(loc="lower left")
 plt.show()
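Outside of the plotting example, the new keyword can be exercised on any fitted estimator. A minimal usage sketch, assuming a scikit-learn build that includes this commit (data and variable names are illustrative only):

import numpy as np
from sklearn.linear_model import BayesianRidge

rng = np.random.RandomState(0)
X = rng.rand(50, 3)                                     # 50 samples, 3 features
y = np.dot(X, np.array([1.0, 0.0, -1.0])) + 0.1 * rng.randn(50)

model = BayesianRidge().fit(X, y)
y_mean, y_std = model.predict(X[:5], return_std=True)   # mean and std of the posterior predictive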

sklearn/linear_model/bayes.py

Lines changed: 103 additions & 6 deletions

@@ -91,6 +91,9 @@ class BayesianRidge(LinearModel, RegressorMixin):
     lambda_ : float
         estimated precision of the weights.

+    sigma_ : array, shape = (n_features, n_features)
+        estimated variance-covariance matrix of the weights
+
     scores_ : float
         if computed, value of the objective function (to be maximized)

@@ -109,6 +112,16 @@
     Notes
     -----
     See examples/linear_model/plot_bayesian_ridge.py for an example.
+
+    References
+    ----------
+    D. J. C. MacKay, Bayesian Interpolation, Computation and Neural Systems,
+    Vol. 4, No. 3, 1992.
+
+    R. Salakhutdinov, Lecture notes on Statistical Machine Learning,
+    http://www.utstat.toronto.edu/~rsalakhu/sta4273/notes/Lecture2.pdf#page=15
+    Their beta is our self.alpha_
+    Their alpha is our self.lambda_
     """

     def __init__(self, n_iter=300, tol=1.e-3, alpha_1=1.e-6, alpha_2=1.e-6,

@@ -142,8 +155,10 @@ def fit(self, X, y):
         self : returns an instance of self.
         """
         X, y = check_X_y(X, y, dtype=np.float64, y_numeric=True)
-        X, y, X_offset, y_offset, X_scale = self._preprocess_data(
+        X, y, X_offset_, y_offset_, X_scale_ = self._preprocess_data(
             X, y, self.fit_intercept, self.normalize, self.copy_X)
+        self.X_offset_ = X_offset_
+        self.X_scale_ = X_scale_
         n_samples, n_features = X.shape

         # Initialization of the values of the parameters

@@ -171,7 +186,8 @@ def fit(self, X, y):
             # coef_ = sigma_^-1 * XT * y
             if n_samples > n_features:
                 coef_ = np.dot(Vh.T,
-                               Vh / (eigen_vals_ + lambda_ / alpha_)[:, None])
+                               Vh / (eigen_vals_ +
+                                     lambda_ / alpha_)[:, np.newaxis])
                 coef_ = np.dot(coef_, XT_y)
                 if self.compute_score:
                     logdet_sigma_ = - np.sum(

@@ -216,10 +232,45 @@ def fit(self, X, y):
         self.alpha_ = alpha_
         self.lambda_ = lambda_
         self.coef_ = coef_
+        sigma_ = np.dot(Vh.T,
+                        Vh / (eigen_vals_ + lambda_ / alpha_)[:, np.newaxis])
+        self.sigma_ = (1. / alpha_) * sigma_

-        self._set_intercept(X_offset, y_offset, X_scale)
+        self._set_intercept(X_offset_, y_offset_, X_scale_)
         return self

+    def predict(self, X, return_std=False):
+        """Predict using the linear model.
+
+        In addition to the mean of the predictive distribution, also its
+        standard deviation can be returned.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix}, shape = (n_samples, n_features)
+            Samples.
+
+        return_std : boolean, optional
+            Whether to return the standard deviation of posterior prediction.
+
+        Returns
+        -------
+        y_mean : array, shape = (n_samples,)
+            Mean of predictive distribution of query points.
+
+        y_std : array, shape = (n_samples,)
+            Standard deviation of predictive distribution of query points.
+        """
+        y_mean = self._decision_function(X)
+        if return_std is False:
+            return y_mean
+        else:
+            if self.normalize:
+                X = (X - self.X_offset_) / self.X_scale_
+            sigmas_squared_data = (np.dot(X, self.sigma_) * X).sum(axis=1)
+            y_std = np.sqrt(sigmas_squared_data + (1. / self.alpha_))
+            return y_mean, y_std
+

 ###############################################################################
 # ARD (Automatic Relevance Determination) regression

@@ -323,6 +374,19 @@ class ARDRegression(LinearModel, RegressorMixin):
     Notes
     --------
     See examples/linear_model/plot_ard.py for an example.
+
+    References
+    ----------
+    D. J. C. MacKay, Bayesian nonlinear modeling for the prediction
+    competition, ASHRAE Transactions, 1994.
+
+    R. Salakhutdinov, Lecture notes on Statistical Machine Learning,
+    http://www.utstat.toronto.edu/~rsalakhu/sta4273/notes/Lecture2.pdf#page=15
+    Their beta is our self.alpha_
+    Their alpha is our self.lambda_
+    ARD is a little different than the slide: only dimensions/features for
+    which self.lambda_ < self.threshold_lambda are kept and the rest are
+    discarded.
     """

     def __init__(self, n_iter=300, tol=1.e-3, alpha_1=1.e-6, alpha_2=1.e-6,

@@ -365,7 +429,7 @@ def fit(self, X, y):
         n_samples, n_features = X.shape
         coef_ = np.zeros(n_features)

-        X, y, X_offset, y_offset, X_scale = self._preprocess_data(
+        X, y, X_offset_, y_offset_, X_scale_ = self._preprocess_data(
             X, y, self.fit_intercept, self.normalize, self.copy_X)

         # Launch the convergence loop

@@ -417,7 +481,7 @@ def fit(self, X, y):
                 s = (lambda_1 * np.log(lambda_) - lambda_2 * lambda_).sum()
                 s += alpha_1 * log(alpha_) - alpha_2 * alpha_
                 s += 0.5 * (fast_logdet(sigma_) + n_samples * log(alpha_) +
-                             np.sum(np.log(lambda_)))
+                            np.sum(np.log(lambda_)))
                 s -= 0.5 * (alpha_ * rmse_ + (lambda_ * coef_ ** 2).sum())
                 self.scores_.append(s)

@@ -432,5 +496,38 @@ def fit(self, X, y):
         self.alpha_ = alpha_
         self.sigma_ = sigma_
         self.lambda_ = lambda_
-        self._set_intercept(X_offset, y_offset, X_scale)
+        self._set_intercept(X_offset_, y_offset_, X_scale_)
         return self
+
+    def predict(self, X, return_std=False):
+        """Predict using the linear model.
+
+        In addition to the mean of the predictive distribution, also its
+        standard deviation can be returned.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix}, shape = (n_samples, n_features)
+            Samples.
+
+        return_std : boolean, optional
+            Whether to return the standard deviation of posterior prediction.
+
+        Returns
+        -------
+        y_mean : array, shape = (n_samples,)
+            Mean of predictive distribution of query points.
+
+        y_std : array, shape = (n_samples,)
+            Standard deviation of predictive distribution of query points.
+        """
+        y_mean = self._decision_function(X)
+        if return_std is False:
+            return y_mean
+        else:
+            if self.normalize:
+                X = (X - self.X_offset_) / self.X_scale_
+            X = X[:, self.lambda_ < self.threshold_lambda]
+            sigmas_squared_data = (np.dot(X, self.sigma_) * X).sum(axis=1)
+            y_std = np.sqrt(sigmas_squared_data + (1. / self.alpha_))
+            return y_mean, y_std
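The y_std returned above is the usual Bayesian linear regression predictive variance: for a query row x it is x^T sigma_ x + 1 / alpha_, i.e. weight uncertainty plus the estimated noise variance. A small standalone sketch of the vectorised computation that predict performs (illustrative helper, not part of the commit):

import numpy as np


def predictive_std(X, sigma, alpha):
    # diag(X @ sigma @ X.T) without forming the full n x n matrix:
    # the per-sample quadratic form x^T sigma x.
    weight_var = (np.dot(X, sigma) * X).sum(axis=1)
    # add the estimated noise variance 1 / alpha and take the square root
    return np.sqrt(weight_var + 1.0 / alpha)

For ARDRegression the same formula applies after dropping the features whose lambda_ exceeds threshold_lambda, which is why its predict masks X before the quadratic form.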

sklearn/linear_model/tests/test_bayes.py

Lines changed: 32 additions & 0 deletions

@@ -56,3 +56,35 @@ def test_toy_ard_object():
     # Check that the model could approximately learn the identity function
     test = [[1], [3], [4]]
     assert_array_almost_equal(clf.predict(test), [1, 3, 4], 2)
+
+
+def test_return_std():
+    # Test return_std option for both Bayesian regressors
+    def f(X):
+        return np.dot(X, w) + b
+
+    def f_noise(X, noise_mult):
+        return f(X) + np.random.randn(X.shape[0])*noise_mult
+
+    d = 5
+    n_train = 50
+    n_test = 10
+
+    w = np.array([1.0, 0.0, 1.0, -1.0, 0.0])
+    b = 1.0
+
+    X = np.random.random((n_train, d))
+    X_test = np.random.random((n_test, d))
+
+    for decimal, noise_mult in enumerate([1, 0.1, 0.01]):
+        y = f_noise(X, noise_mult)
+
+        m1 = BayesianRidge()
+        m1.fit(X, y)
+        y_mean1, y_std1 = m1.predict(X_test, return_std=True)
+        assert_array_almost_equal(y_std1, noise_mult, decimal=decimal)
+
+        m2 = ARDRegression()
+        m2.fit(X, y)
+        y_mean2, y_std2 = m2.predict(X_test, return_std=True)
+        assert_array_almost_equal(y_std2, noise_mult, decimal=decimal)
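The test is a calibration check: the data are generated with known Gaussian noise, so the returned y_std should approach the injected noise level (1, 0.1, 0.01), and the enumerate over decimal tightens the required agreement as the noise shrinks. With the branch checked out, something along these lines should run just this test (the exact invocation depends on your test runner):

pytest sklearn/linear_model/tests/test_bayes.py -k test_return_std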
