scikit-learn
diff --git a/doc/datasets/rcv1.rst b/doc/datasets/rcv1.rst
Lines changed: 1 addition & 1 deletion
diff --git a/sklearn/linear_model/base.py b/sklearn/linear_model/base.py
Lines changed: 109 additions & 42 deletions
diff --git a/sklearn/linear_model/bayes.py b/sklearn/linear_model/bayes.py
Lines changed: 16 additions & 8 deletions
@@ -35,7 +35,7 @@ Each sample can be identified by its ID, ranging (with gaps) from 2286 to 810596
     array([2286, 2287, 2288], dtype=int32)
 
 ``target_names``:
-The target values are the topics of each sample. Each sample belongs to at least one topic, and to up to 17 topics. 
+The target values are the topics of each sample. Each sample belongs to at least one topic, and to up to 17 topics.
 There are 103 topics, each represented by a string. Their corpus frequencies span five orders of magnitude, from 5 occurrences for 'GMIL', to 381327 for 'CCAT'::
 
     >>> rcv1.target_names[:3].tolist()  # doctest: +SKIP
 
@@ -10,7 +10,7 @@
 #         Mathieu Blondel <mathieu@mblondel.org>
 #         Lars Buitinck <L.J.Buitinck@uva.nl>
 #         Maryan Morel <maryan.morel@polytechnique.edu>
-#
+#         Giorgio Patrini <giorgio.patrini@anu.edu.au>
 # License: BSD 3 clause
 
 from __future__ import division
@@ -26,19 +26,16 @@
 from ..externals import six
 from ..externals.joblib import Parallel, delayed
 from ..base import BaseEstimator, ClassifierMixin, RegressorMixin
-from ..utils import as_float_array, check_array, check_X_y, deprecated
+from ..utils import check_array, check_X_y, deprecated, as_float_array
+from ..utils.validation import FLOAT_DTYPES
 from ..utils import check_random_state
 from ..utils.extmath import safe_sparse_dot
 from ..utils.sparsefuncs import mean_variance_axis, inplace_column_scale
 from ..utils.fixes import sparse_lsqr
 from ..utils.seq_dataset import ArrayDataset, CSRDataset
 from ..utils.validation import check_is_fitted
 from ..exceptions import NotFittedError
-
-#
-# TODO: intercept for all models
-# We should define a common function to center data instead of
-# repeating the same code inside each fit method.
+from ..preprocessing.data import normalize as f_normalize
 
 # TODO: bayesian_ridge_regression and bayesian_regression_ard
 # should be squashed into its respective objects.
@@ -70,49 +67,51 @@ def make_dataset(X, y, sample_weight, random_state=None):
     return dataset, intercept_decay
 
 
+@deprecated("sparse_center_data will be removed in "
+            "0.20. Use utilities in preprocessing.data instead")
 def sparse_center_data(X, y, fit_intercept, normalize=False):
     """
     Compute information needed to center data to have mean zero along
     axis 0. Be aware that X will not be centered since it would break
     the sparsity, but will be normalized if asked so.
     """
-    # We might require not to change the csr matrix sometimes
-    # Store a copy if normalize is True.
-    # Change dtype to float64 since mean_variance_axis accepts
-    # it that way.
     if fit_intercept:
+        # we might require not to change the csr matrix sometimes
+        # store a copy if normalize is True.
+        # Change dtype to float64 since mean_variance_axis accepts
+        # it that way.
         if sp.isspmatrix(X) and X.getformat() == 'csr':
             X = sp.csr_matrix(X, copy=normalize, dtype=np.float64)
         else:
             X = sp.csc_matrix(X, copy=normalize, dtype=np.float64)
 
         X_mean, X_var = mean_variance_axis(X, axis=0)
         if normalize:
-            # transform variance to norm in-place
-            # XXX: currently scaled to variance=n_samples to match center_data
+            # transform variance to std in-place
             X_var *= X.shape[0]
-            X_norm = np.sqrt(X_var, X_var)
+            X_std = np.sqrt(X_var, X_var)
             del X_var
-            X_norm[X_norm == 0] = 1
-            inplace_column_scale(X, 1. / X_norm)
+            X_std[X_std == 0] = 1
+            inplace_column_scale(X, 1. / X_std)
         else:
-            X_norm = np.ones(X.shape[1])
+            X_std = np.ones(X.shape[1])
         y_mean = y.mean(axis=0)
         y = y - y_mean
     else:
         X_mean = np.zeros(X.shape[1])
-        X_norm = np.ones(X.shape[1])
+        X_std = np.ones(X.shape[1])
         y_mean = 0. if y.ndim == 1 else np.zeros(y.shape[1], dtype=X.dtype)
 
-    return X, y, X_mean, y_mean, X_norm
+    return X, y, X_mean, y_mean, X_std
 
 
+@deprecated("center_data will be removed in "
+            "0.20. Use utilities in preprocessing.data instead")
 def center_data(X, y, fit_intercept, normalize=False, copy=True,
                 sample_weight=None):
     """
     Centers data to have mean zero along axis 0. This is here because
     nearly all linear models will want their data to be centered.
-
     If sample_weight is not None, then the weighted mean of X and y
     is zero, and not the mean itself
     """
@@ -122,33 +121,97 @@ def center_data(X, y, fit_intercept, normalize=False, copy=True,
             sample_weight = None
         if sp.issparse(X):
             X_mean = np.zeros(X.shape[1])
-            X_norm = np.ones(X.shape[1])
+            X_std = np.ones(X.shape[1])
         else:
             X_mean = np.average(X, axis=0, weights=sample_weight)
             X -= X_mean
             if normalize:
-                # XXX: currently scaled to variance=n_samples
-                X_norm = np.sqrt(np.sum(X ** 2, axis=0))
-                X_norm[X_norm == 0] = 1
-                X /= X_norm
+                X_std = np.sqrt(np.sum(X ** 2, axis=0))
+                X_std[X_std == 0] = 1
+                X /= X_std
             else:
-                X_norm = np.ones(X.shape[1])
+                X_std = np.ones(X.shape[1])
         y_mean = np.average(y, axis=0, weights=sample_weight)
         y = y - y_mean
     else:
         X_mean = np.zeros(X.shape[1])
-        X_norm = np.ones(X.shape[1])
+        X_std = np.ones(X.shape[1])
         y_mean = 0. if y.ndim == 1 else np.zeros(y.shape[1], dtype=X.dtype)
+    return X, y, X_mean, y_mean, X_std
+
+
+def _preprocess_data(X, y, fit_intercept, normalize=False, copy=True,
+                     sample_weight=None, return_mean=False):
+    """
+    Centers data to have mean zero along axis 0. If fit_intercept=False, X is
+    neither centered nor normalized; a sparse X is not centered but can still
+    be normalized. The function returns the statistics necessary to reconstruct
+    the input data, which are X_offset, y_offset, X_scale, such that the output
+
+        X = (X - X_offset) / X_scale
+
+    If sample_weight is not None, then the weighted mean of X and y
+    is zero, and not the mean itself. If return_mean=True, the mean, possibly
+    weighted, is returned, independently of whether X was centered (option used
+    for optimization with sparse data in coordinate_descent).
+
+    This is here because nearly all linear models will want their data to be
+    centered.
+    """
 
-    return X, y, X_mean, y_mean, X_norm
+    if isinstance(sample_weight, numbers.Number):
+        sample_weight = None
 
+    X = check_array(X, copy=copy, accept_sparse=['csr', 'csc'],
+                    dtype=FLOAT_DTYPES)
+
+    if fit_intercept:
+        if sp.issparse(X):
+            X_offset, X_var = mean_variance_axis(X, axis=0)
+            if not return_mean:
+                X_offset = np.zeros(X.shape[1])
+
+            if normalize:
+
+                # TODO: f_normalize could be used here as well but the function
+                # inplace_csr_row_normalize_l2 must be changed such that it
+                # can return also the norms computed internally
+
+                # transform variance to norm in-place
+                X_var *= X.shape[0]
+                X_scale = np.sqrt(X_var, X_var)
+                del X_var
+                X_scale[X_scale == 0] = 1
+                inplace_column_scale(X, 1. / X_scale)
+            else:
+                X_scale = np.ones(X.shape[1])
+
+        else:
+            X_offset = np.average(X, axis=0, weights=sample_weight)
+            X -= X_offset
+            if normalize:
+                X, X_scale = f_normalize(X, axis=0, copy=False,
+                                         return_norm=True)
+            else:
+                X_scale = np.ones(X.shape[1])
+        y_offset = np.average(y, axis=0, weights=sample_weight)
+        y = y - y_offset
+    else:
+        X_offset = np.zeros(X.shape[1])
+        X_scale = np.ones(X.shape[1])
+        y_offset = 0. if y.ndim == 1 else np.zeros(y.shape[1], dtype=X.dtype)
+
+    return X, y, X_offset, y_offset, X_scale
+
+
+# TODO: _rescale_data should be factored into _preprocess_data.
+# Currently, the fact that sag implements its own way to deal with
+# sample_weight makes the refactoring tricky.
 
 def _rescale_data(X, y, sample_weight):
     """Rescale data so as to support sample_weight"""
+    sample_weight = sample_weight * np.ones(y.shape[0])
     sample_weight = np.sqrt(sample_weight)
-    if not isinstance(sample_weight, np.ndarray):  # scalar case
-        sample_weight = sample_weight * np.ones(y.shape[0])
-
     sw_matrix = np.diag(sample_weight)
     if sp.issparse(X) or sp.issparse(y):
         sw_matrix = sparse.dia_matrix(sw_matrix)
@@ -202,13 +265,13 @@ def predict(self, X):
         """
         return self._decision_function(X)
 
-    _center_data = staticmethod(center_data)
+    _preprocess_data = staticmethod(_preprocess_data)
 
     def _set_intercept(self, X_mean, y_mean, X_norm):
         """Set the intercept_
         """
-        self.coef_ = self.coef_ / X_norm
         if self.fit_intercept:
+            self.coef_ = self.coef_ / X_norm
             self.intercept_ = y_mean - np.dot(X_mean, self.coef_.T)
         else:
             self.intercept_ = 0.
@@ -362,9 +425,13 @@ class LinearRegression(LinearModel, RegressorMixin):
 
     normalize : boolean, optional, default False
         If True, the regressors X will be normalized before regression.
-        Normalization makes the `coef_` independent from the number of training
-        samples. If you wish to standardize instead, please use
-        `preprocessing.StandardScaler` before calling `fit`.
+        When the regressors are normalized, the fitted `coef_` are the same
+        independently of the number of training samples; hence, hyperparameters
+        learnt by cross-validation will be compatible among different training
+        and validation sets. The same property is not valid for standardized
+        data. However, if you wish to standardize, please use
+        `preprocessing.StandardScaler` before calling `fit` on an estimator
+        with `normalize=False`.
 
     copy_X : boolean, optional, default True
         If True, X will be copied; else, it may be overwritten.
@@ -440,11 +507,10 @@ def fit(self, X, y, sample_weight=None):
         X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'],
                          y_numeric=True, multi_output=True)
 
-        if ((sample_weight is not None) and
-           np.atleast_1d(sample_weight).ndim > 1):
+        if sample_weight is not None and np.atleast_1d(sample_weight).ndim > 1:
             raise ValueError("Sample weights must be 1D array or scalar")
 
-        X, y, X_mean, y_mean, X_norm = self._center_data(
+        X, y, X_mean, y_mean, X_norm = self._preprocess_data(
             X, y, fit_intercept=self.fit_intercept, normalize=self.normalize,
             copy=self.copy_X, sample_weight=sample_weight)
 
@@ -481,11 +547,12 @@ def _pre_fit(X, y, Xy, precompute, normalize, fit_intercept, copy):
 
     if sparse.isspmatrix(X):
         precompute = False
-        X, y, X_mean, y_mean, X_norm = sparse_center_data(
-            X, y, fit_intercept=fit_intercept, normalize=normalize)
+        X, y, X_mean, y_mean, X_norm = _preprocess_data(
+            X, y, fit_intercept=fit_intercept, normalize=normalize,
+            return_mean=True)
     else:
         # copy was done in fit if necessary
-        X, y, X_mean, y_mean, X_norm = center_data(
+        X, y, X_mean, y_mean, X_norm = _preprocess_data(
             X, y, fit_intercept=fit_intercept, normalize=normalize, copy=copy)
     if hasattr(precompute, '__array__') and (
             fit_intercept and not np.allclose(X_mean, np.zeros(n_features)) or
 
@@ -65,9 +65,13 @@ class BayesianRidge(LinearModel, RegressorMixin):
 
     normalize : boolean, optional, default False
         If True, the regressors X will be normalized before regression.
-        Normalization makes the `coef_` independent from the number of training
-        samples. If you wish to standardize instead, please use
-        `preprocessing.StandardScaler` before calling `fit`.
+        When the regressors are normalized, the fitted `coef_` are the same
+        independently of the number of training samples; hence, hyperparameters
+        learnt by cross-validation will be compatible among different training
+        and validation sets. The same property is not valid for standardized
+        data. However, if you wish to standardize, please use
+        `preprocessing.StandardScaler` before calling `fit` on an estimator
+        with `normalize=False`.
 
     copy_X : boolean, optional, default True
         If True, X will be copied; else, it may be overwritten.
@@ -138,7 +142,7 @@ def fit(self, X, y):
         self : returns an instance of self.
         """
         X, y = check_X_y(X, y, dtype=np.float64, y_numeric=True)
-        X, y, X_mean, y_mean, X_std = self._center_data(
+        X, y, X_mean, y_mean, X_std = self._preprocess_data(
             X, y, self.fit_intercept, self.normalize, self.copy_X)
         n_samples, n_features = X.shape
 
@@ -272,9 +276,13 @@ class ARDRegression(LinearModel, RegressorMixin):
 
     normalize : boolean, optional, default False
         If True, the regressors X will be normalized before regression.
-        Normalization makes the `coef_` independent from the number of training
-        samples. If you wish to standardize instead, please use
-        `preprocessing.StandardScaler` before calling `fit`.
+        When the regressors are normalized, the fitted `coef_` are the same
+        independently of the number of training samples; hence, hyperparameters
+        learnt by cross-validation will be compatible among different training
+        and validation sets. The same property is not valid for standardized
+        data. However, if you wish to standardize, please use
+        `preprocessing.StandardScaler` before calling `fit` on an estimator
+        with `normalize=False`.
 
     copy_X : boolean, optional, default True.
         If True, X will be copied; else, it may be overwritten.
@@ -357,7 +365,7 @@ def fit(self, X, y):
         n_samples, n_features = X.shape
         coef_ = np.zeros(n_features)
 
-        X, y, X_mean, y_mean, X_std = self._center_data(
+        X, y, X_mean, y_mean, X_std = self._preprocess_data(
             X, y, self.fit_intercept, self.normalize, self.copy_X)
 
         # Launch the convergence loop