normalized param deprecated + test fixed · scikit-learn/scikit-learn@492c5cb · GitHub
[go: up one dir, main page]

Skip to content
Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit 492c5cb

Browse files
author
giorgiop
committed
normalized param deprecated + test fixed
1 parent e8d2b93 commit 492c5cb

File tree

2 files changed

+89
-53
lines changed

2 files changed

+89
-53
lines changed

sklearn/linear_model/base.py

Lines changed: 55 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -34,13 +34,10 @@
3434
from ..utils.seq_dataset import ArrayDataset, CSRDataset
3535

3636

37-
###
38-
### TODO: intercept for all models
39-
### We should define a common function to center data instead of
40-
### repeating the same code inside each fit method.
37+
# TODO: intercept for all models
4138

42-
### TODO: bayesian_ridge_regression and bayesian_regression_ard
43-
### should be squashed into its respective objects.
39+
# TODO: bayesian_ridge_regression and bayesian_regression_ard
40+
# should be squashed into its respective objects.
4441

4542
SPARSE_INTERCEPT_DECAY = 0.01
4643
# For sparse data intercept updates are scaled by this decay factor to avoid
@@ -69,27 +66,32 @@ def make_dataset(X, y, sample_weight, random_state=None):
6966
return dataset, intercept_decay
7067

7168

72-
def sparse_center_data(X, y, fit_intercept, normalize=False):
69+
def sparse_center_data(X, y, fit_intercept, standardize=False,
70+
normalize=None):
7371
"""
7472
Compute information needed to center data to have mean zero along
7573
axis 0. Be aware that X will not be centered since it would break
76-
the sparsity, but will be normalized if asked so.
74+
the sparsity, but will be standardized if asked so.
7775
"""
76+
if normalize is not None:
77+
warnings.warn("The `normalize` parameter is not in use anymore from "
78+
"version 0.17 and will be removed in 0.19. If you want "
79+
"to standardize the data instead, use "
80+
"`standardize=True`", DeprecationWarning)
81+
7882
if fit_intercept:
7983
# we might require not to change the csr matrix sometimes
80-
# store a copy if normalize is True.
84+
# store a copy if standardize is True.
8185
# Change dtype to float64 since mean_variance_axis accepts
8286
# it that way.
8387
if sp.isspmatrix(X) and X.getformat() == 'csr':
84-
X = sp.csr_matrix(X, copy=normalize, dtype=np.float64)
88+
X = sp.csr_matrix(X, copy=standardize, dtype=np.float64)
8589
else:
86-
X = sp.csc_matrix(X, copy=normalize, dtype=np.float64)
90+
X = sp.csc_matrix(X, copy=standardize, dtype=np.float64)
8791

8892
X_mean, X_var = mean_variance_axis(X, axis=0)
89-
if normalize:
93+
if standardize:
9094
# transform variance to std in-place
91-
# XXX: currently scaled to variance=n_samples to match center_data
92-
X_var *= X.shape[0]
9395
X_std = np.sqrt(X_var, X_var)
9496
del X_var
9597
X_std[X_std == 0] = 1
@@ -106,15 +108,21 @@ def sparse_center_data(X, y, fit_intercept, normalize=False):
106108
return X, y, X_mean, y_mean, X_std
107109

108110

109-
def center_data(X, y, fit_intercept, normalize=False, copy=True,
110-
sample_weight=None):
111+
def center_data(X, y, fit_intercept, standardize=False, normalize=None,
112+
copy=True, sample_weight=None):
111113
"""
112114
Centers data to have mean zero along axis 0. This is here because
113115
nearly all linear models will want their data to be centered.
114116
115117
If sample_weight is not None, then the weighted mean of X and y
116118
is zero, and not the mean itself
117119
"""
120+
if normalize is not None:
121+
warnings.warn("The `normalize` parameter is not in use anymore from "
122+
"version 0.17 and will be removed in 0.19. If you want "
123+
"to standardize the data instead, use "
124+
"`standardize=True`", DeprecationWarning)
125+
118126
X = as_float_array(X, copy)
119127
if fit_intercept:
120128
if isinstance(sample_weight, numbers.Number):
@@ -125,9 +133,8 @@ def center_data(X, y, fit_intercept, normalize=False, copy=True,
125133
else:
126134
X_mean = np.average(X, axis=0, weights=sample_weight)
127135
X -= X_mean
128-
if normalize:
129-
# XXX: currently scaled to variance=n_samples
130-
X_std = np.sqrt(np.sum(X ** 2, axis=0))
136+
if standardize:
137+
X_std = np.sqrt(np.mean(X ** 2, axis=0))
131138
X_std[X_std == 0] = 1
132139
X /= X_std
133140
else:
@@ -356,8 +363,8 @@ class LinearRegression(LinearModel, RegressorMixin):
356363
to false, no intercept will be used in calculations
357364
(e.g. data is expected to be already centered).
358365
359-
normalize : boolean, optional, default False
360-
If True, the regressors X will be normalized before regression.
366+
standardize : boolean, optional, default False
367+
If True, the regressors X will be standardized before regression.
361368
362369
copy_X : boolean, optional, default True
363370
If True, X will be copied; else, it may be overwritten.
@@ -385,13 +392,26 @@ class LinearRegression(LinearModel, RegressorMixin):
385392
386393
"""
387394

388-
def __init__(self, fit_intercept=True, normalize=False, copy_X=True,
389-
n_jobs=1):
395+
def __init__(self, fit_intercept=True, standardize=False, normalize=None,
396+
copy_X=True, n_jobs=1):
397+
if normalize is not None:
398+
warnings.warn("The `normalize` parameter is not in use anymore "
399+
"from version 0.17 and will be removed in 0.19. If "
400+
"you want the data to be standardized instead, use "
401+
"`standardize=True`", DeprecationWarning)
390402
self.fit_intercept = fit_intercept
391-
self.normalize = normalize
403+
self.standardize = standardize
392404
self.copy_X = copy_X
393405
self.n_jobs = n_jobs
394406

407+
@property
408+
@deprecated("The `normalize` attribute is not in use anymore "
409+
"from version 0.17 and will be removed in 0.19. If "
410+
"you want the data to be standardized instead, use "
411+
"`standardize=True`")
412+
def normalize(self):
413+
return None
414+
395415
def fit(self, X, y, sample_weight=None):
396416
"""
397417
Fit linear model.
@@ -416,11 +436,13 @@ def fit(self, X, y, sample_weight=None):
416436
X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'],
417437
y_numeric=True, multi_output=True)
418438

419-
if ((sample_weight is not None) and np.atleast_1d(sample_weight).ndim > 1):
439+
if ((sample_weight is not None) and
440+
np.atleast_1d(sample_weight).ndim > 1):
420441
sample_weight = column_or_1d(sample_weight, warn=True)
421442

422443
X, y, X_mean, y_mean, X_std = self._center_data(
423-
X, y, self.fit_intercept, self.normalize, self.copy_X,
444+
X, y, fit_intercept=self.fit_intercept,
445+
standardize=self.standardize, copy=self.copy_X,
424446
sample_weight=sample_weight)
425447

426448
if sample_weight is not None:
@@ -450,24 +472,25 @@ def fit(self, X, y, sample_weight=None):
450472
return self
451473

452474

453-
def _pre_fit(X, y, Xy, precompute, normalize, fit_intercept, copy):
475+
def _pre_fit(X, y, Xy, precompute, standardize, fit_intercept, copy):
454476
"""Aux function used at beginning of fit in linear models"""
455477
n_samples, n_features = X.shape
456478

457479
if sparse.isspmatrix(X):
458480
precompute = False
459481
X, y, X_mean, y_mean, X_std = sparse_center_data(
460-
X, y, fit_intercept, normalize)
482+
X, y, fit_intercept=fit_intercept, standardize=standardize)
461483
else:
462484
# copy was done in fit if necessary
463485
X, y, X_mean, y_mean, X_std = center_data(
464-
X, y, fit_intercept, normalize, copy=copy)
486+
X, y, fit_intercept=fit_intercept, standardize=standardize,
487+
copy=copy)
465488
if hasattr(precompute, '__array__') and (
466-
fit_intercept and not np.allclose(X_mean, np.zeros(n_features))
467-
or normalize and not np.allclose(X_std, np.ones(n_features))):
489+
fit_intercept and not np.allclose(X_mean, np.zeros(n_features)) or
490+
standardize and not np.allclose(X_std, np.ones(n_features))):
468491
warnings.warn("Gram matrix was provided but X was centered"
469492
" to fit intercept, "
470-
"or X was normalized : recomputing Gram matrix.",
493+
"or X was standardized : recomputing Gram matrix.",
471494
UserWarning)
472495
# recompute Gram
473496
precompute = 'auto'

sklearn/linear_model/tests/test_base.py

Lines changed: 34 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,14 @@
88

99
from sklearn.utils.testing import assert_array_almost_equal
1010
from sklearn.utils.testing import assert_equal
11+
from sklearn.utils.testing import assert_warns
12+
from sklearn.utils.testing import ignore_warnings
1113

1214
from sklearn.linear_model.base import LinearRegression
13-
from sklearn.linear_model.base import center_data, sparse_center_data, _rescale_data
15+
from sklearn.linear_model.base import center_data
16+
from sklearn.linear_model.base import sparse_center_data
17+
from sklearn.linear_model.base import _rescale_data
1418
from sklearn.utils import check_random_state
15-
from sklearn.utils.testing import assert_raise_message
1619
from sklearn.utils.testing import assert_greater
1720
from sklearn.datasets.samples_generator import make_sparse_uncorrelated
1821
from sklearn.datasets.samples_generator import make_regression
@@ -168,28 +171,27 @@ def test_center_data():
168171
X = rng.rand(n_samples, n_features)
169172
y = rng.rand(n_samples)
170173
expected_X_mean = np.mean(X, axis=0)
171-
# XXX: currently scaled to variance=n_samples
172-
expected_X_std = np.std(X, axis=0) * np.sqrt(X.shape[0])
174+
expected_X_std = np.std(X, axis=0)
173175
expected_y_mean = np.mean(y, axis=0)
174176

175177
Xt, yt, X_mean, y_mean, X_std = center_data(X, y, fit_intercept=False,
176-
normalize=False)
178+
standardize=False)
177179
assert_array_almost_equal(X_mean, np.zeros(n_features))
178180
assert_array_almost_equal(y_mean, 0)
179181
assert_array_almost_equal(X_std, np.ones(n_features))
180182
assert_array_almost_equal(Xt, X)
181183
assert_array_almost_equal(yt, y)
182184

183185
Xt, yt, X_mean, y_mean, X_std = center_data(X, y, fit_intercept=True,
184-
normalize=False)
186+
standardize=False)
185187
assert_array_almost_equal(X_mean, expected_X_mean)
186188
assert_array_almost_equal(y_mean, expected_y_mean)
187189
assert_array_almost_equal(X_std, np.ones(n_features))
188190
assert_array_almost_equal(Xt, X - expected_X_mean)
189191
assert_array_almost_equal(yt, y - expected_y_mean)
190192

191193
Xt, yt, X_mean, y_mean, X_std = center_data(X, y, fit_intercept=True,
192-
normalize=True)
194+
standardize=True)
193195
assert_array_almost_equal(X_mean, expected_X_mean)
194196
assert_array_almost_equal(y_mean, expected_y_mean)
195197
assert_array_almost_equal(X_std, expected_X_std)
@@ -209,17 +211,17 @@ def test_center_data_multioutput():
209211
args = [(center_data, X), (sparse_center_data, sparse.csc_matrix(X))]
210212
for center, X in args:
211213
_, yt, _, y_mean, _ = center(X, y, fit_intercept=False,
212-
normalize=False)
214+
standardize=False)
213215
assert_array_almost_equal(y_mean, np.zeros(n_outputs))
214216
assert_array_almost_equal(yt, y)
215217

216218
_, yt, _, y_mean, _ = center(X, y, fit_intercept=True,
217-
normalize=False)
219+
standardize=False)
218220
assert_array_almost_equal(y_mean, expected_y_mean)
219221
assert_array_almost_equal(yt, y - y_mean)
220222

221223
_, yt, _, y_mean, _ = center(X, y, fit_intercept=True,
222-
normalize=True)
224+
standardize=True)
223225
assert_array_almost_equal(y_mean, expected_y_mean)
224226
assert_array_almost_equal(yt, y - y_mean)
225227

@@ -234,14 +236,12 @@ def test_center_data_weighted():
234236
expected_X_mean = np.average(X, axis=0, weights=sample_weight)
235237
expected_y_mean = np.average(y, axis=0, weights=sample_weight)
236238

237-
# XXX: if normalize=True, should we expect a weighted standard deviation?
239+
# XXX: if standardize=True, should we expect a weighted standard deviation?
238240
# Currently not weighted, but calculated with respect to weighted mean
239-
# XXX: currently scaled to variance=n_samples
240-
expected_X_std = (np.sqrt(X.shape[0]) *
241-
np.mean((X - expected_X_mean) ** 2, axis=0) ** .5)
241+
expected_X_std = (np.mean((X - expected_X_mean) ** 2, axis=0) ** .5)
242242

243243
Xt, yt, X_mean, y_mean, X_std = center_data(X, y, fit_intercept=True,
244-
normalize=False,
244+
standardize=False,
245245
sample_weight=sample_weight)
246246
assert_array_almost_equal(X_mean, expected_X_mean)
247247
assert_array_almost_equal(y_mean, expected_y_mean)
@@ -250,7 +250,7 @@ def test_center_data_weighted():
250250
assert_array_almost_equal(yt, y - expected_y_mean)
251251

252252
Xt, yt, X_mean, y_mean, X_std = center_data(X, y, fit_intercept=True,
253-
normalize=True,
253+
standardize=True,
254254
sample_weight=sample_weight)
255255
assert_array_almost_equal(X_mean, expected_X_mean)
256256
assert_array_almost_equal(y_mean, expected_y_mean)
@@ -268,12 +268,11 @@ def test_sparse_center_data():
268268
X = X.tolil()
269269
y = rng.rand(n_samples)
270270
XA = X.toarray()
271-
# XXX: currently scaled to variance=n_samples
272-
expected_X_std = np.std(XA, axis=0) * np.sqrt(X.shape[0])
271+
expected_X_std = np.std(XA, axis=0)
273272

274273
Xt, yt, X_mean, y_mean, X_std = sparse_center_data(X, y,
275274
fit_intercept=False,
276-
normalize=False)
275+
standardize=False)
277276
assert_array_almost_equal(X_mean, np.zeros(n_features))
278277
assert_array_almost_equal(y_mean, 0)
279278
assert_array_almost_equal(X_std, np.ones(n_features))
@@ -282,7 +281,7 @@ def test_sparse_center_data():
282281

283282
Xt, yt, X_mean, y_mean, X_std = sparse_center_data(X, y,
284283
fit_intercept=True,
285-
normalize=False)
284+
standardize=False)
286285
assert_array_almost_equal(X_mean, np.mean(XA, axis=0))
287286
assert_array_almost_equal(y_mean, np.mean(y, axis=0))
288287
assert_array_almost_equal(X_std, np.ones(n_features))
@@ -291,7 +290,7 @@ def test_sparse_center_data():
291290

292291
Xt, yt, X_mean, y_mean, X_std = sparse_center_data(X, y,
293292
fit_intercept=True,
294-
normalize=True)
293+
standardize=True)
295294
assert_array_almost_equal(X_mean, np.mean(XA, axis=0))
296295
assert_array_almost_equal(y_mean, np.mean(y, axis=0))
297296
assert_array_almost_equal(X_std, expected_X_std)
@@ -322,3 +321,17 @@ def test_rescale_data():
322321
assert_array_almost_equal(rescaled_X, rescaled_X2)
323322
assert_array_almost_equal(rescaled_y, rescaled_y2)
324323

324+
325+
@ignore_warnings
326+
def test_normalize_deprecation():
327+
X = np.array([[1], [2]])
328+
y = np.array([1, 2])
329+
X_csr = sparse.csr_matrix(X)
330+
# est = LinearRegression(normalize=True)
331+
332+
assert_warns(DeprecationWarning, center_data, X, y, True, normalize=True)
333+
assert_warns(DeprecationWarning, sparse_center_data, X_csr, y, True,
334+
normalize=True)
335+
assert_warns(DeprecationWarning, LinearRegression, normalize=True)
336+
# TODO
337+
# assert_warns(DeprecationWarning, est.normalize)

0 commit comments

Comments
 (0)
0