8000 add standard deviation calculation for linear regression by jl2922 · Pull Request #8872 · scikit-learn/scikit-learn · GitHub
[go: up one dir, main page]

Skip to content

add standard deviation calculation for linear regression #8872

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 47 additions & 0 deletions sklearn/linear_model/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -450,6 +450,44 @@ def __init__(self, fit_intercept=True, normalize=False, copy_X=True,
self.copy_X = copy_X
self.n_jobs = n_jobs

def matrix_fit(self, X, y, n, p, dof, sample_weight):
    """Fit coefficients via the normal equations and estimate their
    standard deviations.

    Sets ``coef_`` and ``intercept_`` and, when residual degrees of
    freedom remain, ``stdev_`` — the standard deviation of each fitted
    coefficient (with the intercept's entry last when an intercept is
    fitted).

    Parameters
    ----------
    X : ndarray of shape (n, p)
        Dense training data.
    y : array-like of shape (n,) or (n, n_targets)
        Target values.
    n : int
        Number of samples.
    p : int
        Number of features (excluding the intercept column).
    dof : int
        Residual degrees of freedom, ``n - p``; one more is consumed
        here if an intercept is fitted.
    sample_weight : array-like of shape (n,) or None
        Per-sample weights, applied as sqrt-weights to rows of X and y.

    Returns
    -------
    self or None
        Returns ``self`` only on the singular-matrix fallback path;
        otherwise falls through (the caller ignores the return value).
    """
    # Work on a 2-D copy of y so single- and multi-target inputs are
    # handled uniformly.
    y = np.array(y, copy=True).reshape(n, -1)
    if self.fit_intercept:
        # Model the intercept as an extra all-ones column; it consumes
        # one residual degree of freedom.
        X = np.append(X, np.ones((n, 1)), axis=1)
        dof = dof - 1
    if sample_weight is not None:
        # Weighted least squares: scaling rows by sqrt(w) makes the
        # normal equations solve the weighted problem.
        sample_weight = np.sqrt(np.asarray(sample_weight, dtype=float))
        if sample_weight.size == n:
            sample_weight = sample_weight.reshape(n, 1)
            X = np.multiply(X, sample_weight)
            y = np.multiply(y, sample_weight)
        # else: a scalar weight rescales X and y uniformly, which
        # leaves both the coefficients and their standard deviations
        # unchanged, so it is deliberately ignored.  (This replaces a
        # bare `except: pass` that silently swallowed *every* error.)

    # Evaluate coefficients via the normal equations:
    #   beta = (X^T X)^{-1} X^T y
    XT = X.T
    try:
        XTX_inv = np.linalg.inv(np.dot(XT, X))
    except np.linalg.LinAlgError:
        # Singular design matrix: fall back to zero coefficients.
        # (Fixed: was `linalg.LinAlgError`, a name not used by the
        # `np.linalg.inv` call that raises the error.)
        self.coef_ = np.zeros(1)
        self.intercept_ = np.zeros(1)
        return self
    beta = np.dot(np.dot(XTX_inv, XT), y)
    intercept = beta[-1] if self.fit_intercept else np.zeros(
        y.shape[1])
    coef = beta[0:-1] if self.fit_intercept else beta
    residual = y - np.dot(X, beta)
    # Multi-target results are exposed as (n_targets, p); a single
    # target as a flat (p,) vector.
    self.coef_ = coef.T if coef.shape[1] > 1 else coef.reshape(p,)
    self.intercept_ = intercept.reshape(-1,)

    if dof > 0:
        # var(beta_j) = [(X^T X)^{-1}]_{jj} * RSS / dof, per target.
        diag = np.diagonal(XTX_inv)
        var_beta = np.outer(diag, np.sum(residual**2, axis=0)) / dof
        stdev = np.sqrt(var_beta).reshape(beta.shape)
        self.stdev_ = \
            stdev.T if coef.shape[1] > 1 else stdev.reshape(-1,)

def fit(self, X, y, sample_weight=None):
"""
Fit linear model.
Expand Down Expand Up @@ -480,6 +518,15 @@ def fit(self, X, y, sample_weight=None):
if sample_weight is not None and np.atleast_1d(sample_weight).ndim > 1:
raise ValueError("Sample weights must be 1D array or scalar")

if not sp.issparse(X) and not self.normalize:
X = np.array(X)
n = X.shape[0]
p = X.shape[1]
dof = n - p
if dof >= 0:
self.matrix_fit(X, y, n, p, dof, sample_weight)
return self

X, y, X_offset, y_offset, X_scale = self._preprocess_data(
X, y, fit_intercept=self.fit_intercept, normalize=self.normalize,
copy=self.copy_X, sample_weight=sample_weight)
Expand Down
33 changes: 33 additions & 0 deletions sklearn/linear_model/tests/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,39 @@ def test_linear_regression():
assert_array_almost_equal(reg.predict(X), [0])


def test_linear_regression_uncertainty():
    # Multivariate fit: check coefficients, intercept and their
    # standard deviations against reference values computed with R.
    features = [[5.0e-6, 2.5e-1],
                [2.0e-6, 4.0e-2],
                [1.0e-6, 1.0e-2],
                [5.0e-7, 2.5e-3],
                [2.0e-7, 4.0e-4],
                [1.0e-7, 1.0e-4]]
    targets = [-0.5930783034, -0.592875163, -0.5928283259,
               -0.5928091337, -0.5928019288, -0.5928014142]

    model = LinearRegression()
    model.fit(features, targets)

    assert_array_almost_equal(
        model.coef_, [-2.68764200535e+01, -5.90375272820e-04])
    assert_array_almost_equal(model.intercept_, [-5.92796478799e-01])
    assert_array_almost_equal(
        model.stdev_, [2.10596119117, 3.97159995702e-05, 1.48683754360e-06])

    # Weighted fit: again compared against reference values from R.
    w_features = [[6], [7], [8]]
    w_targets = [8000, 50000, 116000]
    weights = [123, 123, 246]
    model = LinearRegression()
    model.fit(w_features, w_targets, sample_weight=weights)

    assert_array_almost_equal(model.coef_, [55090.90909091])
    assert_array_almost_equal(model.intercept_, [-326909.09090909])
    assert_array_almost_equal(model.stdev_, [6171.11372672, 45032.21987029])

def test_linear_regression_sample_weights():
# TODO: loop over sparse data as well

Expand Down
0