normalized param deprecated + test fixed · scikit-learn/scikit-learn@682b219 · GitHub

Commit 682b219

Author: giorgiop

normalized param deprecated + test fixed

1 parent: e8d2b93 · commit: 682b219

File tree: 2 files changed, +167 −51 lines

sklearn/linear_model/base.py

Lines changed: 120 additions & 30 deletions
@@ -34,13 +34,10 @@
 from ..utils.seq_dataset import ArrayDataset, CSRDataset


-###
-### TODO: intercept for all models
-### We should define a common function to center data instead of
-### repeating the same code inside each fit method.
+# TODO: intercept for all models

-### TODO: bayesian_ridge_regression and bayesian_regression_ard
-### should be squashed into its respective objects.
+# TODO: bayesian_ridge_regression and bayesian_regression_ard
+# should be squashed into its respective objects.

 SPARSE_INTERCEPT_DECAY = 0.01
 # For sparse data intercept updates are scaled by this decay factor to avoid
@@ -69,12 +66,9 @@ def make_dataset(X, y, sample_weight, random_state=None):
     return dataset, intercept_decay


-def sparse_center_data(X, y, fit_intercept, normalize=False):
-    """
-    Compute information needed to center data to have mean zero along
-    axis 0. Be aware that X will not be centered since it would break
-    the sparsity, but will be normalized if asked so.
-    """
+# TODO: this reproduces the behavior prior 0.17
+# Must be remove in 0.19
+def _sparse_center_data(X, y, fit_intercept, normalize=None):
     if fit_intercept:
         # we might require not to change the csr matrix sometimes
         # store a copy if normalize is True.
@@ -106,15 +100,96 @@ def sparse_center_data(X, y, fit_intercept, normalize=False):
     return X, y, X_mean, y_mean, X_std


-def center_data(X, y, fit_intercept, normalize=False, copy=True,
-                sample_weight=None):
+def sparse_center_data(X, y, fit_intercept, standardize=False,
+                       normalize=None):
+    """
+    Compute information needed to center data to have mean zero along
+    axis 0. Be aware that X will not be centered since it would break
+    the sparsity, but will be standardized if asked so.
+    """
+    if normalize is not None:
+        warnings.warn("The `normalize` parameter is not in use anymore from "
+                      "version 0.17 and will be removed in 0.19. If you want "
+                      "to standardize the data instead, use"
+                      "`standardize=True`", DeprecationWarning)
+        return _sparse_center_data(X, y, fit_intercept, normalize)
+
+    if fit_intercept:
+        # we might require not to change the csr matrix sometimes
+        # store a copy if standardize is True.
+        # Change dtype to float64 since mean_variance_axis accepts
+        # it that way.
+        if sp.isspmatrix(X) and X.getformat() == 'csr':
+            X = sp.csr_matrix(X, copy=standardize, dtype=np.float64)
+        else:
+            X = sp.csc_matrix(X, copy=standardize, dtype=np.float64)
+
+        X_mean, X_var = mean_variance_axis(X, axis=0)
+        if standardize:
+            # transform variance to std in-place
+            X_std = np.sqrt(X_var, X_var)
+            del X_var
+            X_std[X_std == 0] = 1
+            inplace_column_scale(X, 1. / X_std)
+        else:
+            X_std = np.ones(X.shape[1])
+        y_mean = y.mean(axis=0)
+        y = y - y_mean
+    else:
+        X_mean = np.zeros(X.shape[1])
+        X_std = np.ones(X.shape[1])
+        y_mean = 0. if y.ndim == 1 else np.zeros(y.shape[1], dtype=X.dtype)
+
+    return X, y, X_mean, y_mean, X_std
+
+
+# TODO: this reproduces the behavior prior 0.17
+# Must be remove in 0.19
+def _center_data(X, y, fit_intercept, normalize=False, copy=True,
+                 sample_weight=None):
+
+    X = as_float_array(X, copy)
+    if fit_intercept:
+        if isinstance(sample_weight, numbers.Number):
+            sample_weight = None
+        if sp.issparse(X):
+            X_mean = np.zeros(X.shape[1])
+            X_std = np.ones(X.shape[1])
+        else:
+            X_mean = np.average(X, axis=0, weights=sample_weight)
+            X -= X_mean
+            if normalize:
+                # XXX: currently scaled to variance=n_samples
+                X_std = np.sqrt(np.sum(X ** 2, axis=0))
+                X_std[X_std == 0] = 1
+                X /= X_std
+            else:
+                X_std = np.ones(X.shape[1])
+        y_mean = np.average(y, axis=0, weights=sample_weight)
+        y = y - y_mean
+    else:
+        X_mean = np.zeros(X.shape[1])
+        X_std = np.ones(X.shape[1])
+        y_mean = 0. if y.ndim == 1 else np.zeros(y.shape[1], dtype=X.dtype)
+    return X, y, X_mean, y_mean, X_std
+
+
+def center_data(X, y, fit_intercept, standardize=False, normalize=None,
+                copy=True, sample_weight=None):
     """
     Centers data to have mean zero along axis 0. This is here because
     nearly all linear models will want their data to be centered.

     If sample_weight is not None, then the weighted mean of X and y
     is zero, and not the mean itself
     """
+    if normalize is not None:
+        warnings.warn("The `normalize` parameter is not in use anymore from "
+                      "version 0.17 and will be removed in 0.19. If you want "
+                      "to standardize the data instead, use"
+                      "`standardize=True`", DeprecationWarning)
+        return _center_data(X, y, fit_intercept, normalize, copy, sample_weight)
+
     X = as_float_array(X, copy)
     if fit_intercept:
         if isinstance(sample_weight, numbers.Number):
@@ -125,9 +200,8 @@ def center_data(X, y, fit_intercept, normalize=False, copy=True,
         else:
             X_mean = np.average(X, axis=0, weights=sample_weight)
             X -= X_mean
-            if normalize:
-                # XXX: currently scaled to variance=n_samples
-                X_std = np.sqrt(np.sum(X ** 2, axis=0))
+            if standardize:
+                X_std = np.sqrt(np.mean(X ** 2, axis=0))
                 X_std[X_std == 0] = 1
                 X /= X_std
             else:
@@ -356,8 +430,8 @@ class LinearRegression(LinearModel, RegressorMixin):
         to false, no intercept will be used in calculations
         (e.g. data is expected to be already centered).

-    normalize : boolean, optional, default False
-        If True, the regressors X will be normalized before regression.
+    standardize : boolean, optional, default False
+        If True, the regressors X will be standardized before regression.

     copy_X : boolean, optional, default True
         If True, X will be copied; else, it may be overwritten.
@@ -385,13 +459,26 @@ class LinearRegression(LinearModel, RegressorMixin):

     """

-    def __init__(self, fit_intercept=True, normalize=False, copy_X=True,
-                 n_jobs=1):
+    def __init__(self, fit_intercept=True, standardize=False, normalize=None,
+                 copy_X=True, n_jobs=1):
+        if normalize is not None:
+            warnings.warn("The `normalize` parameter is not in use anymore "
+                          "from version 0.17 and will be removed in 0.19. If "
+                          "you want the data to be standardized instead, use "
+                          "`standardize=True`", DeprecationWarning)
         self.fit_intercept = fit_intercept
-        self.normalize = normalize
+        self.standardize = standardize
         self.copy_X = copy_X
         self.n_jobs = n_jobs

+    @property
+    @deprecated("The `normalize` attribute is not in use anymore "
+                "from version 0.17 and will be removed in 0.19. If "
+                "you want the data to be standardized instead, use "
+                "`standardize=True`")
+    def normalize(self):
+        return None
+
     def fit(self, X, y, sample_weight=None):
         """
         Fit linear model.
@@ -416,11 +503,13 @@ def fit(self, X, y, sample_weight=None):
         X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'],
                          y_numeric=True, multi_output=True)

-        if ((sample_weight is not None) and np.atleast_1d(sample_weight).ndim > 1):
+        if ((sample_weight is not None) and
+                np.atleast_1d(sample_weight).ndim > 1):
             sample_weight = column_or_1d(sample_weight, warn=True)

         X, y, X_mean, y_mean, X_std = self._center_data(
-            X, y, self.fit_intercept, self.normalize, self.copy_X,
+            X, y, fit_intercept=self.fit_intercept,
+            standardize=self.standardize, copy=self.copy_X,
             sample_weight=sample_weight)

         if sample_weight is not None:
@@ -450,24 +539,25 @@ def fit(self, X, y, sample_weight=None):
         return self


-def _pre_fit(X, y, Xy, precompute, normalize, fit_intercept, copy):
+def _pre_fit(X, y, Xy, precompute, standardize, fit_intercept, copy):
     """Aux function used at beginning of fit in linear models"""
     n_samples, n_features = X.shape

     if sparse.isspmatrix(X):
         precompute = False
         X, y, X_mean, y_mean, X_std = sparse_center_data(
-            X, y, fit_intercept, normalize)
+            X, y, fit_intercept=fit_intercept, standardize=standardize)
     else:
         # copy was done in fit if necessary
         X, y, X_mean, y_mean, X_std = center_data(
-            X, y, fit_intercept, normalize, copy=copy)
+            X, y, fit_intercept=fit_intercept, standardize=standardize,
+            copy=copy)
     if hasattr(precompute, '__array__') and (
-        fit_intercept and not np.allclose(X_mean, np.zeros(n_features))
-        or normalize and not np.allclose(X_std, np.ones(n_features))):
+            fit_intercept and not np.allclose(X_mean, np.zeros(n_features)) or
+            standardize and not np.allclose(X_std, np.ones(n_features))):
         warnings.warn("Gram matrix was provided but X was centered"
                       " to fit intercept, "
-                      "or X was normalized : recomputing Gram matrix.",
+                      "or X was standardized : recomputing Gram matrix.",
                       UserWarning)
         # recompute Gram
         precompute = 'auto'
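
In short, center_data and sparse_center_data gain a `standardize` flag that divides each centered column by its standard deviation, while passing the old `normalize` keyword only emits a DeprecationWarning and delegates to the private pre-0.17 helpers. A minimal sketch of both call paths, assuming this branch of scikit-learn (the `standardize` parameter exists only in this commit and is not part of any released scikit-learn API):

import warnings

import numpy as np

from sklearn.linear_model.base import center_data

X = np.array([[1.0, 10.0],
              [2.0, 20.0],
              [3.0, 30.0]])
y = np.array([1.0, 2.0, 3.0])

# New spelling: after centering, each column is divided by its standard
# deviation, so X_std equals np.std(X, axis=0).
Xt, yt, X_mean, y_mean, X_std = center_data(X, y, fit_intercept=True,
                                            standardize=True)
np.testing.assert_allclose(X_std, np.std(X, axis=0))

# Old spelling: still accepted, but it warns and falls back to _center_data,
# which keeps the pre-0.17 scaling (columns divided by
# sqrt(sum(x_centered ** 2)), i.e. "variance = n_samples").
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    _, _, _, _, X_std_old = center_data(X, y, fit_intercept=True,
                                        normalize=True)
assert any(issubclass(w.category, DeprecationWarning) for w in caught)
np.testing.assert_allclose(X_std_old, np.std(X, axis=0) * np.sqrt(len(X)))

The last assertion makes the behavioral difference explicit: the deprecated path still scales columns to "variance = n_samples" rather than to unit variance.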

sklearn/linear_model/tests/test_base.py

Lines changed: 47 additions & 21 deletions
@@ -8,11 +8,14 @@

 from sklearn.utils.testing import assert_array_almost_equal
 from sklearn.utils.testing import assert_equal
+from sklearn.utils.testing import assert_warns
+from sklearn.utils.testing import ignore_warnings

 from sklearn.linear_model.base import LinearRegression
-from sklearn.linear_model.base import center_data, sparse_center_data, _rescale_data
+from sklearn.linear_model.base import center_data
+from sklearn.linear_model.base import sparse_center_data
+from sklearn.linear_model.base import _rescale_data
 from sklearn.utils import check_random_state
-from sklearn.utils.testing import assert_raise_message
 from sklearn.utils.testing import assert_greater
 from sklearn.datasets.samples_generator import make_sparse_uncorrelated
 from sklearn.datasets.samples_generator import make_regression
@@ -168,28 +171,35 @@ def test_center_data():
     X = rng.rand(n_samples, n_features)
     y = rng.rand(n_samples)
     expected_X_mean = np.mean(X, axis=0)
-    # XXX: currently scaled to variance=n_samples
-    expected_X_std = np.std(X, axis=0) * np.sqrt(X.shape[0])
+    expected_X_std = np.std(X, axis=0)
     expected_y_mean = np.mean(y, axis=0)

     Xt, yt, X_mean, y_mean, X_std = center_data(X, y, fit_intercept=False,
-                                                normalize=False)
+                                                standardize=False)
     assert_array_almost_equal(X_mean, np.zeros(n_features))
     assert_array_almost_equal(y_mean, 0)
     assert_array_almost_equal(X_std, np.ones(n_features))
     assert_array_almost_equal(Xt, X)
     assert_array_almost_equal(yt, y)

     Xt, yt, X_mean, y_mean, X_std = center_data(X, y, fit_intercept=True,
-                                                normalize=False)
+                                                standardize=False)
     assert_array_almost_equal(X_mean, expected_X_mean)
     assert_array_almost_equal(y_mean, expected_y_mean)
     assert_array_almost_equal(X_std, np.ones(n_features))
     assert_array_almost_equal(Xt, X - expected_X_mean)
     assert_array_almost_equal(yt, y - expected_y_mean)

+    Xt, yt, X_mean, y_mean, X_std = center_data(X, y, fit_intercept=False,
+                                                standardize=True)
+    assert_array_almost_equal(X_mean, np.zeros(n_features))
+    assert_array_almost_equal(y_mean, 0)
+    assert_array_almost_equal(X_std, np.ones(n_features))
+    assert_array_almost_equal(Xt, X)
+    assert_array_almost_equal(yt, y)
+
     Xt, yt, X_mean, y_mean, X_std = center_data(X, y, fit_intercept=True,
-                                                normalize=True)
+                                                standardize=True)
     assert_array_almost_equal(X_mean, expected_X_mean)
     assert_array_almost_equal(y_mean, expected_y_mean)
     assert_array_almost_equal(X_std, expected_X_std)
@@ -209,17 +219,17 @@ def test_center_data_multioutput():
     args = [(center_data, X), (sparse_center_data, sparse.csc_matrix(X))]
     for center, X in args:
         _, yt, _, y_mean, _ = center(X, y, fit_intercept=False,
-                                     normalize=False)
+                                     standardize=False)
         assert_array_almost_equal(y_mean, np.zeros(n_outputs))
         assert_array_almost_equal(yt, y)

         _, yt, _, y_mean, _ = center(X, y, fit_intercept=True,
-                                     normalize=False)
+                                     standardize=False)
         assert_array_almost_equal(y_mean, expected_y_mean)
         assert_array_almost_equal(yt, y - y_mean)

         _, yt, _, y_mean, _ = center(X, y, fit_intercept=True,
-                                     normalize=True)
+                                     standardize=True)
         assert_array_almost_equal(y_mean, expected_y_mean)
         assert_array_almost_equal(yt, y - y_mean)

@@ -234,14 +244,12 @@ def test_center_data_weighted():
     expected_X_mean = np.average(X, axis=0, weights=sample_weight)
     expected_y_mean = np.average(y, axis=0, weights=sample_weight)

-    # XXX: if normalize=True, should we expect a weighted standard deviation?
+    # XXX: if standardize=True, should we expect a weighted standard deviation?
     # Currently not weighted, but calculated with respect to weighted mean
-    # XXX: currently scaled to variance=n_samples
-    expected_X_std = (np.sqrt(X.shape[0]) *
-                      np.mean((X - expected_X_mean) ** 2, axis=0) ** .5)
+    expected_X_std = (np.mean((X - expected_X_mean) ** 2, axis=0) ** .5)

     Xt, yt, X_mean, y_mean, X_std = center_data(X, y, fit_intercept=True,
-                                                normalize=False,
+                                                standardize=False,
                                                 sample_weight=sample_weight)
     assert_array_almost_equal(X_mean, expected_X_mean)
     assert_array_almost_equal(y_mean, expected_y_mean)
@@ -250,7 +258,7 @@ def test_center_data_weighted():
     assert_array_almost_equal(yt, y - expected_y_mean)

     Xt, yt, X_mean, y_mean, X_std = center_data(X, y, fit_intercept=True,
-                                                normalize=True,
+                                                standardize=True,
                                                 sample_weight=sample_weight)
     assert_array_almost_equal(X_mean, expected_X_mean)
     assert_array_almost_equal(y_mean, expected_y_mean)
@@ -268,12 +276,11 @@ def test_sparse_center_data():
     X = X.tolil()
     y = rng.rand(n_samples)
     XA = X.toarray()
-    # XXX: currently scaled to variance=n_samples
-    expected_X_std = np.std(XA, axis=0) * np.sqrt(X.shape[0])
+    expected_X_std = np.std(XA, axis=0)

     Xt, yt, X_mean, y_mean, X_std = sparse_center_data(X, y,
                                                        fit_intercept=False,
-                                                       normalize=False)
+                                                       standardize=False)
     assert_array_almost_equal(X_mean, np.zeros(n_features))
     assert_array_almost_equal(y_mean, 0)
     assert_array_almost_equal(X_std, np.ones(n_features))
@@ -282,7 +289,7 @@ def test_sparse_center_data():

     Xt, yt, X_mean, y_mean, X_std = sparse_center_data(X, y,
                                                        fit_intercept=True,
-                                                       normalize=False)
+                                                       standardize=False)
     assert_array_almost_equal(X_mean, np.mean(XA, axis=0))
     assert_array_almost_equal(y_mean, np.mean(y, axis=0))
     assert_array_almost_equal(X_std, np.ones(n_features))
@@ -291,7 +298,7 @@ def test_sparse_center_data():

     Xt, yt, X_mean, y_mean, X_std = sparse_center_data(X, y,
                                                        fit_intercept=True,
-                                                       normalize=True)
+                                                       standardize=True)
     assert_array_almost_equal(X_mean, np.mean(XA, axis=0))
     assert_array_almost_equal(y_mean, np.mean(y, axis=0))
     assert_array_almost_equal(X_std, expected_X_std)
@@ -322,3 +329,22 @@ def test_rescale_data():
     assert_array_almost_equal(rescaled_X, rescaled_X2)
     assert_array_almost_equal(rescaled_y, rescaled_y2)

+
+@ignore_warnings
+def test_normalize_deprecation():
+    X = np.array([[1], [2]])
+    y = np.array([1, 2])
+    X_csr = sparse.csr_matrix(X)
+
+    assert_warns(DeprecationWarning, center_data, X, y, True, normalize=True)
+    assert_warns(DeprecationWarning, sparse_center_data, X_csr, y, True,
+                 normalize=True)
+    assert_warns(DeprecationWarning, LinearRegression, normalize=True)
+
+    # Test warning when accessing the normalize attribute
+    est = LinearRegression(normalize=True)
+
+    def trigger_warning_for_property(estimator):
+        est.normalize
+
+    assert_warns(DeprecationWarning, trigger_warning_for_property, est)
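
The new test_normalize_deprecation above checks these warnings with assert_warns. As a usage-level sketch of the same behavior at the estimator level, again assuming this branch (released scikit-learn has neither a `standardize` constructor argument nor a deprecated `normalize` property on LinearRegression):

import warnings

from sklearn.linear_model import LinearRegression

# Preferred spelling on this branch.
est = LinearRegression(standardize=True)
assert est.standardize is True

# The old keyword still constructs a working estimator but emits a warning;
# the value is discarded, and `normalize` itself is now a deprecated
# read-only property that returns None.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    old_style = LinearRegression(normalize=True)
    _ = old_style.normalize  # accessing the deprecated property also warns
assert sum(issubclass(w.category, DeprecationWarning) for w in caught) >= 2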

0 commit comments
