@@ -85,7 +85,7 @@ def sparse_center_data(X, y, fit_intercept, normalize=False):
85
85
else :
86
86
X = sp .csc_matrix (X , copy = normalize , dtype = np .float64 )
87
87
88
- X_mean , X_var = mean_variance_axis (X , axis = 0 )
88
+ X_offset , X_var = mean_variance_axis (X , axis = 0 )
89
89
if normalize :
90
90
# transform variance to std in-place
91
91
X_var *= X .shape [0 ]
@@ -95,14 +95,14 @@ def sparse_center_data(X, y, fit_intercept, normalize=False):
95
95
inplace_column_scale (X , 1. / X_std )
96
96
else :
97
97
X_std = np .ones (X .shape [1 ])
98
- y_mean = y .mean (axis = 0 )
99
- y = y - y_mean
98
+ y_offset = y .mean (axis = 0 )
99
+ y = y - y_offset
100
100
else :
101
- X_mean = np .zeros (X .shape [1 ])
101
+ X_offset = np .zeros (X .shape [1 ])
102
102
X_std = np .ones (X .shape [1 ])
103
- y_mean = 0. if y .ndim == 1 else np .zeros (y .shape [1 ], dtype = X .dtype )
103
+ y_offset = 0. if y .ndim == 1 else np .zeros (y .shape [1 ], dtype = X .dtype )
104
104
105
- return X , y , X_mean , y_mean , X_std
105
+ return X , y , X_offset , y_offset , X_std
106
106
107
107
108
108
@deprecated ("center_data will be removed in "
@@ -120,24 +120,25 @@ def center_data(X, y, fit_intercept, normalize=False, copy=True,
120
120
if isinstance (sample_weight , numbers .Number ):
121
121
sample_weight = None
122
122
if sp .issparse (X ):
123
- X_mean = np .zeros (X .shape [1 ])
123
+ X_offset = np .zeros (X .shape [1 ])
124
124
X_std = np .ones (X .shape [1 ])
125
125
else :
126
- X_mean = np .average (X , axis = 0 , weights = sample_weight )
127
- X -= X_mean
126
+ X_offset = np .average (X , axis = 0 , weights = sample_weight )
127
+ X -= X_offset
128
+ # XXX: currently scaled to variance=n_samples
128
129
if normalize :
129
130
X_std = np .sqrt (np .sum (X ** 2 , axis = 0 ))
130
131
X_std [X_std == 0 ] = 1
131
132
X /= X_std
132
133
else :
133
134
X_std = np .ones (X .shape [1 ])
134
- y_mean = np .average (y , axis = 0 , weights = sample_weight )
135
- y = y - y_mean
135
+ y_offset = np .average (y , axis = 0 , weights = sample_weight )
136
+ y = y - y_offset
136
137
else :
137
- X_mean = np .zeros (X .shape [1 ])
138
+ X_offset = np .zeros (X .shape [1 ])
138
139
X_std = np .ones (X .shape [1 ])
139
- y_mean = 0. if y .ndim == 1 else np .zeros (y .shape [1 ], dtype = X .dtype )
140
- return X , y , X_mean , y_mean , X_std
140
+ y_offset = 0. if y .ndim == 1 else np .zeros (y .shape [1 ], dtype = X .dtype )
141
+ return X , y , X_offset , y_offset , X_std
141
142
142
143
143
144
def _preprocess_data (X , y , fit_intercept , normalize = False , copy = True ,
@@ -150,10 +151,11 @@ def _preprocess_data(X, y, fit_intercept, normalize=False, copy=True,
150
151
151
152
X = (X - X_offset) / X_scale
152
153
153
- If sample_weight is not None, then the weighted mean of X and y
154
- is zero, and not the mean itself. If return_mean=True, the mean, eventually
155
- weighted, is returned, independently of whether X was centered (option used
156
- for optimization with sparse data in coordinate_descend).
154
+ X_scale is the L2 norm of X - X_offset. If sample_weight is not None,
155
+ then the weighted mean of X and y is zero, and not the mean itself. If
156
+ return_mean=True, the mean, possibly weighted, is returned, independently
157
+ of whether X was centered (option used for optimization with sparse data in
158
+ coordinate_descent).
157
159
158
160
This is here because nearly all linear models will want their data to be
159
161
centered.
@@ -210,11 +212,11 @@ def _preprocess_data(X, y, fit_intercept, normalize=False, copy=True,
210
212
211
213
def _rescale_data(X, y, sample_weight):
    """Rescale data so as to support sample_weight.

    Every row of ``X`` and the matching entry/row of ``y`` is multiplied by
    the square root of its sample weight.

    Parameters
    ----------
    X : array or sparse matrix
        Only ``X.shape[0]`` (taken as the number of samples) is read here
        before the product is computed.
    y : array or sparse matrix
        Targets, rescaled with the same weights as ``X``.
    sample_weight : scalar or array
        Multiplied by ``np.ones(n_samples)``, so a scalar is expanded to one
        weight per sample.

    Returns
    -------
    X, y
        The rescaled inputs, as returned by ``safe_sparse_dot``.
    """
    n_samples = X.shape[0]
    # Multiplying by a vector of ones expands a scalar weight to length
    # n_samples.
    sample_weight = sample_weight * np.ones(n_samples)
    sample_weight = np.sqrt(sample_weight)
    # Build the diagonal weight matrix directly in sparse DIA format rather
    # than going through a dense (n_samples, n_samples) np.diag array.
    sw_matrix = sparse.dia_matrix((sample_weight, 0),
                                  shape=(n_samples, n_samples))
    X = safe_sparse_dot(sw_matrix, X)
    y = safe_sparse_dot(sw_matrix, y)
    return X, y
@@ -267,12 +269,12 @@ def predict(self, X):
267
269
268
270
_preprocess_data = staticmethod (_preprocess_data )
269
271
270
def _set_intercept(self, X_offset, y_offset, X_scale):
    """Set the intercept_ and undo the feature scaling on coef_.

    Parameters
    ----------
    X_offset : array
        Per-feature offset that was subtracted from X before fitting.
    y_offset : float or array
        Offset that was subtracted from y before fitting.
    X_scale : array
        Per-feature scale that X was divided by before fitting.

    Notes
    -----
    When ``self.fit_intercept`` is true, ``self.coef_`` is divided by
    ``X_scale`` and ``self.intercept_`` is set to
    ``y_offset - np.dot(X_offset, self.coef_.T)``. Otherwise ``self.coef_``
    is left untouched and ``self.intercept_`` is set to ``0.``.
    """
    if self.fit_intercept:
        # Rescale first: the intercept below must use the rescaled coef_.
        self.coef_ = self.coef_ / X_scale
        self.intercept_ = y_offset - np.dot(X_offset, self.coef_.T)
    else:
        self.intercept_ = 0.
278
280
@@ -425,11 +427,11 @@ class LinearRegression(LinearModel, RegressorMixin):
425
427
426
428
normalize : boolean, optional, default False
427
429
If True, the regressors X will be normalized before regression.
428
- When the regressors are normalized, the fitted `coef_` are the same
429
- independently of the number of training samples; hence, hyperparameters
430
- learnt by cross-validation will be compatible among different training
431
- and validation sets . The same property is not valid for standardized
432
- data. However, if you wish to standardize, please use
430
+ This parameter is ignored when `fit_intercept` is set to `False`.
431
+ When the regressors are normalized, note that this makes the
432
+ hyperparameters learnt more robust and almost independent of the number
433
+ of samples. The same property is not valid for standardized data.
434
+ However, if you wish to standardize, please use
433
435
`preprocessing.StandardScaler` before calling `fit` on an estimator
434
436
with `normalize=False`.
435
437
@@ -510,7 +512,7 @@ def fit(self, X, y, sample_weight=None):
510
512
if sample_weight is not None and np .atleast_1d (sample_weight ).ndim > 1 :
511
513
raise ValueError ("Sample weights must be 1D array or scalar" )
512
514
513
- X , y , X_mean , y_mean , X_norm = self ._preprocess_data (
515
+ X , y , X_offset , y_offset , X_scale = self ._preprocess_data (
514
516
X , y , fit_intercept = self .fit_intercept , normalize = self .normalize ,
515
517
copy = self .copy_X , sample_weight = sample_weight )
516
518
@@ -537,7 +539,7 @@ def fit(self, X, y, sample_weight=None):
537
539
538
540
if y .ndim == 1 :
539
541
self .coef_ = np .ravel (self .coef_ )
540
- self ._set_intercept (X_mean , y_mean , X_norm )
542
+ self ._set_intercept (X_offset , y_offset , X_scale )
541
543
return self
542
544
543
545
@@ -547,16 +549,16 @@ def _pre_fit(X, y, Xy, precompute, normalize, fit_intercept, copy):
547
549
548
550
if sparse .isspmatrix (X ):
549
551
precompute = False
550
- X , y , X_mean , y_mean , X_norm = _preprocess_data (
552
+ X , y , X_offset , y_offset , X_scale = _preprocess_data (
551
553
X , y , fit_intercept = fit_intercept , normalize = normalize ,
552
554
return_mean = True )
553
555
else :
554
556
# copy was done in fit if necessary
555
- X , y , X_mean , y_mean , X_norm = _preprocess_data (
557
+ X , y , X_offset , y_offset , X_scale = _preprocess_data (
556
558
X , y , fit_intercept = fit_intercept , normalize = normalize , copy = copy )
557
559
if hasattr (precompute , '__array__' ) and (
558
- fit_intercept and not np .allclose (X_mean , np .zeros (n_features )) or
559
- normalize and not np .allclose (X_norm , np .ones (n_features ))):
560
+ fit_intercept and not np .allclose (X_offset , np .zeros (n_features )) or
561
+ normalize and not np .allclose (X_scale , np .ones (n_features ))):
560
562
warnings .warn ("Gram matrix was provided but X was centered"
561
563
" to fit intercept, "
562
564
"or X was normalized : recomputing Gram matrix." ,
@@ -593,4 +595,4 @@ def _pre_fit(X, y, Xy, precompute, normalize, fit_intercept, copy):
593
595
order = 'F' )
594
596
np .dot (y .T , X , out = Xy .T )
595
597
596
- return X , y , X_mean , y_mean , X_norm , precompute , Xy
598
+ return X , y , X_offset , y_offset , X_scale , precompute , Xy
0 commit comments