10
10
# Mathieu Blondel <mathieu@mblondel.org>
11
11
# Lars Buitinck <L.J.Buitinck@uva.nl>
12
12
# Maryan Morel <maryan.morel@polytechnique.edu>
13
- #
13
+ # Giorgio Patrini <giorgio.patrini@anu.edu.au>
14
14
# License: BSD 3 clause
15
15
16
16
from __future__ import division
26
26
from ..externals import six
27
27
from ..externals .joblib import Parallel , delayed
28
28
from ..base import BaseEstimator , ClassifierMixin , RegressorMixin
29
- from ..utils import as_float_array , check_array , check_X_y , deprecated
30
- from ..utils import check_random_state , column_or_1d
29
+ from ..utils import check_array , check_X_y , deprecated , as_float_array
30
+ from ..utils .validation import FLOAT_DTYPES
31
+ from ..utils import check_random_state
31
32
from ..utils .extmath import safe_sparse_dot
32
33
from ..utils .sparsefuncs import mean_variance_axis , inplace_column_scale
33
34
from ..utils .fixes import sparse_lsqr
34
35
from ..utils .seq_dataset import ArrayDataset , CSRDataset
35
36
from ..utils .validation import check_is_fitted
36
37
from ..exceptions import NotFittedError
37
-
38
-
39
- #
40
- # TODO: intercept for all models
41
- # We should define a common function to center data instead of
42
- # repeating the same code inside each fit method.
38
+ from ..preprocessing .data import normalize as f_normalize
43
39
44
40
# TODO: bayesian_ridge_regression and bayesian_regression_ard
45
41
# should be squashed into its respective objects.
@@ -71,6 +67,8 @@ def make_dataset(X, y, sample_weight, random_state=None):
71
67
return dataset , intercept_decay
72
68
73
69
70
+ @deprecated ("sparse_center_data will be removed in "
71
+ "0.20. Use utilities in preprocessing.data instead" )
74
72
def sparse_center_data (X , y , fit_intercept , normalize = False ):
75
73
"""
76
74
Compute information needed to center data to have mean zero along
@@ -87,33 +85,33 @@ def sparse_center_data(X, y, fit_intercept, normalize=False):
87
85
else :
88
86
X = sp .csc_matrix (X , copy = normalize , dtype = np .float64 )
89
87
90
- X_mean , X_var = mean_variance_axis (X , axis = 0 )
88
+ X_offset , X_var = mean_variance_axis (X , axis = 0 )
91
89
if normalize :
92
90
# transform variance to std in-place
93
- # XXX: currently scaled to variance=n_samples to match center_data
94
91
X_var *= X .shape[0 ]
95
92
X_std = np .sqrt (X_var , X_var )
96
93
del X_var
97
94
X_std [X_std == 0 ] = 1
98
95
inplace_column_scale (X , 1. / X_std )
99
96
else :
100
97
X_std = np .ones (X .shape [1 ])
101
- y_mean = y .mean (axis = 0 )
102
- y = y - y_mean
98
+ y_offset = y .mean (axis = 0 )
99
+ y = y - y_offset
103
100
else :
104
- X_mean = np .zeros (X .shape [1 ])
101
+ X_offset = np .zeros (X .shape [1 ])
105
102
X_std = np .ones (X .shape [1 ])
106
- y_mean = 0. if y .ndim == 1 else np .zeros (y .shape [1 ], dtype = X .dtype )
103
+ y_offset = 0. if y .ndim == 1 else np .zeros (y .shape [1 ], dtype = X .dtype )
107
104
108
- return X , y , X_mean , y_mean , X_std
105
+ return X , y , X_offset , y_offset , X_std
109
106
110
107
108
+ @deprecated ("center_data will be removed in "
109
+ "0.20. Use utilities in preprocessing.data instead" )
111
110
def center_data (X , y , fit_intercept , normalize = False , copy = True ,
112
111
sample_weight = None ):
113
112
"""
114
113
Centers data to have mean zero along axis 0. This is here because
115
114
nearly all linear models will want their data to be centered.
116
-
117
115
If sample_weight is not None, then the weighted mean of X and y
118
116
is zero, and not the mean itself
119
117
"""
@@ -122,26 +120,95 @@ def center_data(X, y, fit_intercept, normalize=False, copy=True,
122
120
if isinstance (sample_weight , numbers .Number ):
123
121
sample_weight = None
124
122
if sp .issparse (X ):
125
- X_mean = np .zeros (X .shape [1 ])
123
+ X_offset = np .zeros (X .shape [1 ])
126
124
X_std = np .ones (X .shape [1 ])
127
125
else :
128
- X_mean = np .average (X , axis = 0 , weights = sample_weight )
129
- X -= X_mean
126
+ X_offset = np .average (X , axis = 0 , weights = sample_weight )
127
+ X -= X_offset
128
+ # XXX: currently scaled to variance=n_samples
130
129
if normalize :
131
- # XXX: currently scaled to variance=n_samples
132
130
X_std = np .sqrt (np .sum (X ** 2 , axis = 0 ))
133
131
X_std [X_std == 0 ] = 1
134
132
X /= X_std
135
133
else :
136
134
X_std = np .ones (X .shape [1 ])
137
- y_mean = np .average (y , axis = 0 , weights = sample_weight )
138
- y = y - y_mean
135
+ y_offset = np .average (y , axis = 0 , weights = sample_weight )
136
+ y = y - y_offset
139
137
else :
140
- X_mean = np .zeros (X .shape [1 ])
138
+ X_offset = np .zeros (X .shape [1 ])
141
139
X_std = np .ones (X .shape [1 ])
142
- y_mean = 0. if y .ndim == 1 else np .zeros (y .shape [1 ], dtype = X .dtype )
143
- return X , y , X_mean , y_mean , X_std
140
+ y_offset = 0. if y .ndim == 1 else np .zeros (y .shape [1 ], dtype = X .dtype )
141
+ return X , y , X_offset , y_offset , X_std
142
+
143
+
144
+ def _preprocess_data (X , y , fit_intercept , normalize = False , copy = True ,
145
+ sample_weight = None , return_mean = False ):
146
+ """
147
+ Centers data to have mean zero along axis 0. If X is a sparse matrix, no
148
+ centering is done, but normalization can still be applied; if
149
+ fit_intercept=False, neither is done. The function returns the statistics
150
+ X_offset, y_offset, X_scale needed to reconstruct the input: the output is
151
+
152
+ X = (X - X_offset) / X_scale
153
+
154
+ X_scale is the L2 norm of X - X_offset. If sample_weight is not None,
155
+ then the weighted mean of X and y is zero, and not the mean itself. If
156
+ return_mean=True, the mean, possibly weighted, is returned, independently
157
+ of whether X was centered (option used for optimization with sparse data in
158
+ coordinate_descent).
159
+
160
+ This is here because nearly all linear models will want their data to be
161
+ centered.
162
+ """
163
+
164
+ if isinstance (sample_weight , numbers .Number ):
165
+ sample_weight = None
166
+
167
+ X = check_array (X , copy = copy , accept_sparse = ['csr' , 'csc' ],
168
+ dtype = FLOAT_DTYPES )
169
+
170
+ if fit_intercept :
171
+ if sp .issparse (X ):
172
+ X_offset , X_var = mean_variance_axis (X , axis = 0 )
173
+ if not return_mean :
174
+ X_offset = np .zeros (X .shape [1 ])
175
+
176
+ if normalize :
177
+
178
+ # TODO: f_normalize could be used here as well but the function
179
+ # inplace_csr_row_normalize_l2 must be changed such that it
180
+ # can also return the norms computed internally
181
+
182
+ # transform variance to norm in-place
183
+ X_var *= X .shape [0 ]
184
+ X_scale = np .sqrt (X_var , X_var )
185
+ del X_var
186
+ X_scale [X_scale == 0 ] = 1
187
+ inplace_column_scale (X , 1. / X_scale )
188
+ else :
189
+ X_scale = np .ones (X .shape [1 ])
190
+
191
+ else :
192
+ X_offset = np .average (X , axis = 0 , weights = sample_weight )
193
+ X -= X_offset
194
+ if normalize :
195
+ X , X_scale = f_normalize (X , axis = 0 , copy = False ,
196
+ return_norm = True )
197
+ else :
198
+ X_scale = np .ones (X .shape [1 ])
199
+ y_offset = np .average (y , axis = 0 , weights = sample_weight )
200
+ y = y - y_offset
201
+ else :
202
+ X_offset = np .zeros (X .shape [1 ])
203
+ X_scale = np .ones (X .shape [1 ])
204
+ y_offset = 0. if y .ndim == 1 else np .zeros (y .shape [1 ], dtype = X .dtype )
205
+
206
+ return X , y , X_offset , y_offset , X_scale
207
+
144
208
209
+ # TODO: _rescale_data should be factored into _preprocess_data.
210
+ # Currently, the fact that sag implements its own way to deal with
211
+ # sample_weight makes the refactoring tricky.
145
212
146
213
def _rescale_data (X , y , sample_weight ):
147
214
"""Rescale data so as to support sample_weight"""
@@ -200,14 +267,14 @@ def predict(self, X):
200
267
"""
201
268
return self ._decision_function (X )
202
269
203
- _center_data = staticmethod (center_data )
270
+ _preprocess_data = staticmethod (_preprocess_data )
204
271
205
- def _set_intercept (self , X_mean , y_mean , X_std ):
272
+ def _set_intercept (self , X_offset , y_offset , X_scale ):
206
273
"""Set the intercept_
207
274
"""
208
275
if self .fit_intercept :
209
- self .coef_ = self .coef_ / X_std
210
- self .intercept_ = y_mean - np .dot (X_mean , self .coef_ .T )
276
+ self .coef_ = self .coef_ / X_scale
277
+ self .intercept_ = y_offset - np .dot (X_offset , self .coef_ .T )
211
278
else :
212
279
self .intercept_ = 0.
213
280
@@ -360,6 +427,13 @@ class LinearRegression(LinearModel, RegressorMixin):
360
427
361
428
normalize : boolean, optional, default False
362
429
If True, the regressors X will be normalized before regression.
430
+ This parameter is ignored when `fit_intercept` is set to False.
431
+ When the regressors are normalized, note that this makes the
432
+ hyperparameters learnt more robust and almost independent of the number
433
+ of samples. The same property is not valid for standardized data.
434
+ However, if you wish to standardize, please use
435
+ `preprocessing.StandardScaler` before calling `fit` on an estimator
436
+ with `normalize=False`.
363
437
364
438
copy_X : boolean, optional, default True
365
439
If True, X will be copied; else, it may be overwritten.
@@ -435,13 +509,12 @@ def fit(self, X, y, sample_weight=None):
435
509
X , y = check_X_y (X , y , accept_sparse = ['csr' , 'csc' , 'coo' ],
436
510
y_numeric = True , multi_output = True )
437
511
438
- if ((sample_weight is not None ) and np .atleast_1d (
439
- sample_weight ).ndim > 1 ):
440
- sample_weight = column_or_1d (sample_weight , warn = True )
512
+ if sample_weight is not None and np .atleast_1d (sample_weight ).ndim > 1 :
513
+ raise ValueError ("Sample weights must be 1D array or scalar" )
441
514
442
- X , y , X_mean , y_mean , X_std = self ._center_data (
443
- X , y , self .fit_intercept , self . normalize , self .copy_X ,
444
- sample_weight = sample_weight )
515
+ X , y , X_offset , y_offset , X_scale = self ._preprocess_data (
516
+ X , y , fit_intercept = self .fit_intercept , normalize = self .normalize ,
517
+ copy = self . copy_X , sample_weight = sample_weight )
445
518
446
519
if sample_weight is not None :
447
520
# Sample weight can be implemented via a simple rescaling.
@@ -466,7 +539,7 @@ def fit(self, X, y, sample_weight=None):
466
539
467
540
if y .ndim == 1 :
468
541
self .coef_ = np .ravel (self .coef_ )
469
- self ._set_intercept (X_mean , y_mean , X_std )
542
+ self ._set_intercept (X_offset , y_offset , X_scale )
470
543
return self
471
544
472
545
@@ -476,15 +549,16 @@ def _pre_fit(X, y, Xy, precompute, normalize, fit_intercept, copy):
476
549
477
550
if sparse .isspmatrix (X ):
478
551
precompute = False
479
- X , y , X_mean , y_mean , X_std = sparse_center_data (
480
- X , y , fit_intercept , normalize )
552
+ X , y , X_offset , y_offset , X_scale = _preprocess_data (
553
+ X , y , fit_intercept = fit_intercept , normalize = normalize ,
554
+ return_mean = True )
481
555
else :
482
556
# copy was done in fit if necessary
483
- X , y , X_mean , y_mean , X_std = center_data (
484
- X , y , fit_intercept , normalize , copy = copy )
557
+ X , y , X_offset , y_offset , X_scale = _preprocess_data (
558
+ X , y , fit_intercept = fit_intercept , normalize = normalize , copy = copy )
485
559
if hasattr (precompute , '__array__' ) and (
486
- fit_intercept and not np .allclose (X_mean , np .zeros (n_features ))
487
- or normalize and not np .allclose (X_std , np .ones (n_features ))):
560
+ fit_intercept and not np .allclose (X_offset , np .zeros (n_features )) or
561
+ normalize and not np .allclose (X_scale , np .ones (n_features ))):
488
562
warnings .warn ("Gram matrix was provided but X was centered"
489
563
" to fit intercept, "
490
564
"or X was normalized : recomputing Gram matrix." ,
@@ -521,4 +595,4 @@ def _pre_fit(X, y, Xy, precompute, normalize, fit_intercept, copy):
521
595
order = 'F' )
522
596
np .dot (y .T , X , out = Xy .T )
523
597
524
- return X , y , X_mean , y_mean , X_std , precompute , Xy
598
+ return X , y , X_offset , y_offset , X_scale , precompute , Xy
0 commit comments