Commit 5d6bd5a · filthysocks/scikit-learn
Author: Jonas Seiler (committed)

Do not allow multiple features for non-noisy data. Unit test for noisy data (scikit-learn#3162)
1 parent 8ad7472 commit 5d6bd5a

File tree

2 files changed: 131 additions, 25 deletions


sklearn/gaussian_process/gaussian_process.py

Lines changed: 16 additions & 1 deletion
@@ -16,6 +16,7 @@
 from . import correlation_models as correlation
 
 MACHINE_EPSILON = np.finfo(np.double).eps
+DEFAULT_NUGGET = 10. * MACHINE_EPSILON
 
 
 def l1_cross_distances(X):
@@ -221,7 +222,7 @@ def __init__(self, regr='constant', corr='squared_exponential', beta0=None,
                  storage_mode='full', verbose=False, theta0=1e-1,
                  thetaL=None, thetaU=None, optimizer='fmin_cobyla',
                  random_start=1, normalize=True,
-                 nugget=10. * MACHINE_EPSILON, random_state=None):
+                 nugget=DEFAULT_NUGGET, random_state=None):
 
         self.regr = regr
         self.corr = corr
@@ -296,6 +297,9 @@ def fit(self, X, y):
 
         # Calculate matrix of distances D between samples
         D, ij = l1_cross_distances(X)
+        if self._prohibitMultipleInputFeature(D):
+            raise Exception("Multiple input features cannot have the same"
+                            " value for non-noisy data.")
 
         # Regression matrix and parameters
         F = self.regr(X)
@@ -521,6 +525,15 @@ def predict(self, X, eval_MSE=False, batch_size=None):
 
         return y
 
+    def _prohibitMultipleInputFeature(self, D):
+        """
+        Multiple input features are only allowed for noisy data
+        """
+        return self.nugget <= DEFAULT_NUGGET and \
+            np.min(np.sum(D, axis=1)) == 0. and \
+            self.corr != correlation.pure_nugget
+
+
     def reduced_likelihood_function(self, theta=None):
         """
         This function determines the BLUP parameters and evaluates the reduced
@@ -582,6 +595,8 @@ def reduced_likelihood_function(self, theta=None):
         if D is None:
             # Light storage mode (need to recompute D, ij and F)
             D, ij = l1_cross_distances(self.X)
+            if self._prohibitMultipleInputFeature(D):
+                raise Exception("Multiple X only make sense for noisy data")
             F = self.regr(self.X)
 
         # Set up R
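
The guard added above relies on a simple property of l1_cross_distances: it returns one row of componentwise absolute differences for every pair of distinct samples, so an exactly duplicated row in X produces an all-zero row in D and the minimum row sum of D drops to 0. The snippet below is an illustrative sketch only (the pairwise helper is a hypothetical stand-in, not sklearn's implementation) showing why that condition flags duplicates.

import numpy as np

def l1_pairwise_diffs(X):
    # Hypothetical stand-in for l1_cross_distances: one row of componentwise
    # absolute differences per (i, j) pair with i < j.
    n = X.shape[0]
    return np.array([np.abs(X[i] - X[j])
                     for i in range(n) for j in range(i + 1, n)])

X = np.array([[0., 1.],
              [2., 3.],
              [0., 1.]])      # rows 0 and 2 are identical
D = l1_pairwise_diffs(X)

# An all-zero row makes the minimum row sum 0., which is the condition
# _prohibitMultipleInputFeature combines with "nugget <= DEFAULT_NUGGET" and
# "corr is not pure_nugget" to reject duplicates for non-noisy data.
print(np.min(np.sum(D, axis=1)) == 0.)   # True: duplicates present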

sklearn/gaussian_process/tests/test_gaussian_process.py

Lines changed: 115 additions & 24 deletions
@@ -9,6 +9,9 @@
 from nose.tools import assert_true
 
 import numpy as np
+import scipy.stats as ss
+
+import pylab as pl
 
 from sklearn.gaussian_process import GaussianProcess
 from sklearn.gaussian_process import regression_models as regression
@@ -39,7 +42,7 @@ def test_1d(regr=regression.constant, corr=correlation.squared_exponential,
                 and np.allclose(MSE2, 0., atol=10))
 
 
-def test_2d(regr=regression.constant, corr=correlation.squared_exponential,
+def check2d(X, regr=regression.constant, corr=correlation.squared_exponential,
             random_start=10, beta0=None):
     """
     MLE estimation of a two-dimensional Gaussian Process model accounting for
@@ -49,14 +52,7 @@ def test_2d(regr=regression.constant, corr=correlation.squared_exponential,
     """
     b, kappa, e = 5., .5, .1
     g = lambda x: b - x[:, 1] - kappa * (x[:, 0] - e) ** 2.
-    X = np.array([[-4.61611719, -6.00099547],
-                  [4.10469096, 5.32782448],
-                  [0.00000000, -0.50000000],
-                  [-6.17289014, -4.6984743],
-                  [1.3109306, -6.93271427],
-                  [-5.03823144, 3.10584743],
-                  [-2.87600388, 6.74310541],
-                  [5.21301203, 4.26386883]])
+
     y = g(X).ravel()
     gp = GaussianProcess(regr=regr, corr=corr, beta0=beta0,
                          theta0=[1e-2] * 2, thetaL=[1e-4] * 2,
@@ -68,7 +64,7 @@
     assert_true(np.allclose(y_pred, y) and np.allclose(MSE, 0.))
 
 
-def test_2d_2d(regr=regression.constant, corr=correlation.squared_exponential,
+def check2d_2d(X, regr=regression.constant, corr=correlation.squared_exponential,
                random_start=10, beta0=None):
     """
     MLE estimation of a two-dimensional Gaussian Process model accounting for
@@ -79,14 +75,7 @@ def test_2d_2d(regr=regression.constant, corr=correlation.squared_exponential,
     b, kappa, e = 5., .5, .1
     g = lambda x: b - x[:, 1] - kappa * (x[:, 0] - e) ** 2.
     f = lambda x: np.vstack((g(x), g(x))).T
-    X = np.array([[-4.61611719, -6.00099547],
-                  [4.10469096, 5.32782448],
-                  [0.00000000, -0.50000000],
-                  [-6.17289014, -4.6984743],
-                  [1.3109306, -6.93271427],
-                  [-5.03823144, 3.10584743],
-                  [-2.87600388, 6.74310541],
-                  [5.21301203, 4.26386883]])
+
     y = f(X)
     gp = GaussianProcess(regr=regr, corr=corr, beta0=beta0,
                          theta0=[1e-2] * 2, thetaL=[1e-4] * 2,
@@ -112,26 +101,128 @@ def test_more_builtin_correlation_models(random_start=1):
     all_corr = ['absolute_exponential', 'squared_exponential', 'cubic',
                 'linear']
 
+    X = np.array([[-4.61611719, -6.00099547],
+                  [4.10469096, 5.32782448],
+                  [0.00000000, -0.50000000],
+                  [-6.17289014, -4.6984743],
+                  [1.3109306, -6.93271427],
+                  [-5.03823144, 3.10584743],
+                  [-2.87600388, 6.74310541],
+                  [5.21301203, 4.26386883]])
+
     for corr in all_corr:
         test_1d(regr='constant', corr=corr, random_start=random_start)
-        test_2d(regr='constant', corr=corr, random_start=random_start)
-        test_2d_2d(regr='constant', corr=corr, random_start=random_start)
+        check2d(X, regr='constant', corr=corr, random_start=random_start)
+        check2d_2d(X, regr='constant', corr=corr, random_start=random_start)
 
 
 def test_ordinary_kriging():
     """
     Repeat test_1d and test_2d with given regression weights (beta0) for
     different regression models (Ordinary Kriging).
     """
+
+    X = np.array([[-4.61611719, -6.00099547],
+                  [4.10469096, 5.32782448],
+                  [0.00000000, -0.50000000],
+                  [-6.17289014, -4.6984743],
+                  [1.3109306, -6.93271427],
+                  [-5.03823144, 3.10584743],
+                  [-2.87600388, 6.74310541],
+                  [5.21301203, 4.26386883]])
+
     test_1d(regr='linear', beta0=[0., 0.5])
     test_1d(regr='quadratic', beta0=[0., 0.5, 0.5])
-    test_2d(regr='linear', beta0=[0., 0.5, 0.5])
-    test_2d(regr='quadratic', beta0=[0., 0.5, 0.5, 0.5, 0.5, 0.5])
-    test_2d_2d(regr='linear', beta0=[0., 0.5, 0.5])
-    test_2d_2d(regr='quadratic', beta0=[0., 0.5, 0.5, 0.5, 0.5, 0.5])
+    check2d(X, regr='linear', beta0=[0., 0.5, 0.5])
+    check2d(X, regr='quadratic', beta0=[0., 0.5, 0.5, 0.5, 0.5, 0.5])
+    check2d_2d(X, regr='linear', beta0=[0., 0.5, 0.5])
+    check2d_2d(X, regr='quadratic', beta0=[0., 0.5, 0.5, 0.5, 0.5, 0.5])
 
 
 def test_no_normalize():
     gp = GaussianProcess(normalize=False).fit(X, y)
     y_pred = gp.predict(X)
     assert_true(np.allclose(y_pred, y))
+
+
+@raises(Exception)
+def test_no_multiple_feature():
+    '''
+    check that multiple features are not allowed for non-noisy data
+    '''
+    X = np.array([[-4.61611719, -6.00099547],
+                  [4.10469096, 5.32782448],
+                  [0.00000000, -0.50000000],
+                  [-6.17289014, -4.6984743],
+                  [-6.17289014, -4.6984743],
+                  [1.3109306, -6.93271427],
+                  [-5.03823144, 3.10584743],
+                  [-2.87600388, 6.74310541],
+                  [5.21301203, 4.26386883]])
+
+    check2d(X)
+
+
+def check2dNoisy(X, regr=regression.constant, corr=correlation.squared_exponential,
+                 random_start=10, beta0=None):
+    """
+    MLE estimation of a two-dimensional Gaussian Process model accounting for
+    anisotropy. Check random start optimization.
+
+    Test the interpolating property.
+    """
+    b, kappa, e = 5., .5, .1
+    g = lambda x: b - x[:, 1] - kappa * (x[:, 0] - e) ** 2.
+
+    y = g(X).ravel() + np.random.normal(0, 0.1, X.shape[0])
+    gp = GaussianProcess(regr=regr, corr=corr, beta0=beta0,
+                         theta0=[1e-2] * 2, thetaL=[1e-4] * 2,
+                         thetaU=[1.] * 2,
+                         random_start=random_start, verbose=False, nugget=0.1)
+    gp.fit(X, y)
+    y_pred, MSE = gp.predict(X, eval_MSE=True)
+
+    fig = pl.figure()
+    pl.plot(X, f(X), 'r:', label=u'$f(x) = x\,\sin(x)$')
+    pl.plot(X, y, 'r.', markersize=10, label=u'Observations')
+    pl.plot(X, y_pred, 'b-', label=u'Prediction')
+    pl.fill(np.concatenate([X, X[::-1]]),
+            np.concatenate([y_pred - 1.9600 * MSE,
+                            (y_pred + 1.9600 * MSE)[::-1]]),
+            alpha=.5, fc='b', ec='None', label='95% confidence interval')
+    pl.xlabel('$x$')
+    pl.ylabel('$f(x)$')
+    pl.ylim(-10, 10)
+    pl.legend(loc='upper left')
+    pl.show()
+
+    print((np.abs(y_pred - y) <= ss.norm.ppf(0.999, y_pred, np.sqrt(MSE))))
+
+    assert_true((np.abs(y_pred - y) <= ss.norm.ppf(0.999, y_pred, np.sqrt(
+        MSE))).all())  # check that difference between prediction and truth is smaller than 99.9% conf. int.
+
+
+def test_1d_noisy(regr=regression.constant, corr=correlation.absolute_exponential,
+                  random_start=10, beta0=None):
+    """
+    MLE estimation of a one-dimensional Gaussian Process model.
+    Check random start optimization with noisy / duplicate inputs.
+
+    Test the interpolating property.
+    """
+
+    X = np.atleast_2d([1., 3., 5., 6., 7., 8., 9., 10.] * 2).T
+    x = np.atleast_2d(np.linspace(0, 10, 50)).T
+
+    y = f(X).ravel() + np.random.normal(0, 0.1, len(X))
+
+    gp = GaussianProcess(regr=regr, corr=corr, beta0=beta0,
+                         theta0=1e-2, thetaL=1e-4, thetaU=1e-1,
+                         random_start=random_start, verbose=False, nugget=0.01).fit(X, y)
+    y_pred, MSE = gp.predict(x, eval_MSE=True)
+
+    y = f(x).ravel()
+    assert_true((np.abs(y_pred - y) <= np.abs(
+        ss.norm.ppf(0.025, y_pred, np.sqrt(MSE)))).all())  # check that true value is within 95% conf. int.
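
For context, here is a minimal usage sketch (not part of the commit) of the behaviour these tests exercise, assuming the patched legacy GaussianProcess estimator from this branch; the data and noise level are illustrative only. Duplicate rows in X are accepted when an explicit nugget larger than DEFAULT_NUGGET marks the data as noisy; with the default nugget, the new check in fit() raises.

import numpy as np
from sklearn.gaussian_process import GaussianProcess  # legacy estimator on this branch

np.random.seed(0)
X = np.atleast_2d([1., 3., 5., 5., 7., 9.]).T               # the two 5.0 rows are duplicates
y = np.sin(X).ravel() + np.random.normal(0, 0.1, len(X))    # noisy observations

# Noisy data: an explicit nugget above DEFAULT_NUGGET allows duplicate inputs.
GaussianProcess(theta0=1e-2, thetaL=1e-4, thetaU=1e-1, nugget=0.01).fit(X, y)

# Non-noisy data (default nugget): the duplicate rows are rejected.
try:
    GaussianProcess(theta0=1e-2, thetaL=1e-4, thetaU=1e-1).fit(X, y)
except Exception as exc:
    print(exc)   # "Multiple input features cannot have the same value for non-noisy data."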

0 commit comments
