From 2ed0c22c170e119ea254b2619a7c4c278e8c65b3 Mon Sep 17 00:00:00 2001
From: Arthur Mensch
Date: Thu, 20 Aug 2015 11:36:24 +0200
Subject: [PATCH] Added check_input in Lasso for DictLearning performance

---
 doc/whats_new.rst                          |  4 ++
 sklearn/decomposition/dict_learning.py     | 32 ++++++++---
 sklearn/linear_model/base.py               | 22 ++++++--
 sklearn/linear_model/coordinate_descent.py | 55 +++++++++++++------
 .../tests/test_coordinate_descent.py       | 38 +++++++++++++
 5 files changed, 118 insertions(+), 33 deletions(-)

diff --git a/doc/whats_new.rst b/doc/whats_new.rst
index e45e208890ee5..ff975e8577c52 100644
--- a/doc/whats_new.rst
+++ b/doc/whats_new.rst
@@ -105,6 +105,10 @@ Enhancements
      with ``n_jobs > 1`` used with a large grid of parameters on a small
      dataset. By `Vlad Niculae`_, `Olivier Grisel`_ and `Loic Esteve`_.
 
+   - Improved speed (3 times per iteration) of
+     :class:`decomposition.DictionaryLearning` with the coordinate descent
+     method from :class:`linear_model.Lasso`. By `Arthur Mensch`_.
+
 Bug fixes
 .........
 
diff --git a/sklearn/decomposition/dict_learning.py b/sklearn/decomposition/dict_learning.py
index e0a22fce086b9..fce61ef95ad6d 100644
--- a/sklearn/decomposition/dict_learning.py
+++ b/sklearn/decomposition/dict_learning.py
@@ -107,10 +107,10 @@ def _sparse_encode(X, dictionary, gram, cov=None, algorithm='lasso_lars',
 
     elif algorithm == 'lasso_cd':
         alpha = float(regularization) / n_features  # account for scaling
-        clf = Lasso(alpha=alpha, fit_intercept=False, precompute=gram,
-                    max_iter=max_iter, warm_start=True)
+        clf = Lasso(alpha=alpha, fit_intercept=False, normalize=False,
+                    precompute=gram, max_iter=max_iter, warm_start=True)
         clf.coef_ = init
-        clf.fit(dictionary.T, X.T)
+        clf.fit(dictionary.T, X.T, check_input=False)
         new_code = clf.coef_
 
     elif algorithm == 'lars':
@@ -224,8 +224,10 @@ def sparse_encode(X, dictionary, gram=None, cov=None, algorithm='lasso_lars',
     n_components = dictionary.shape[0]
 
     if gram is None and algorithm != 'threshold':
-        gram = np.dot(dictionary, dictionary.T)
-    if cov is None:
+        # Transposing the product to ensure Fortran ordering
+        gram = np.dot(dictionary, dictionary.T).T
+
+    if cov is None and algorithm != 'lasso_cd':
         copy_cov = False
         cov = np.dot(dictionary, X.T)
 
@@ -239,10 +241,17 @@ def sparse_encode(X, dictionary, gram=None, cov=None, algorithm='lasso_lars',
         regularization = 1.
 
     if n_jobs == 1 or algorithm == 'threshold':
-        return _sparse_encode(X, dictionary, gram, cov=cov,
+        code = _sparse_encode(X,
+                              dictionary, gram, cov=cov,
                               algorithm=algorithm,
                               regularization=regularization, copy_cov=copy_cov,
-                              init=init, max_iter=max_iter)
+                              init=init,
+                              max_iter=max_iter)
+        # This ensures that the dimensionality of code is always 2,
+        # consistent with the case n_jobs > 1
+        if code.ndim == 1:
+            code = code[np.newaxis, :]
+        return code
 
     # Enter parallel code block
     code = np.empty((n_samples, n_components))
@@ -250,7 +259,9 @@ def sparse_encode(X, dictionary, gram=None, cov=None, algorithm='lasso_lars',
 
     code_views = Parallel(n_jobs=n_jobs)(
         delayed(_sparse_encode)(
-            X[this_slice], dictionary, gram, cov[:, this_slice], algorithm,
+            X[this_slice], dictionary, gram,
+            cov[:, this_slice] if cov is not None else None,
+            algorithm,
             regularization=regularization, copy_cov=copy_cov,
             init=init[this_slice] if init is not None else None,
             max_iter=max_iter)
@@ -639,7 +650,6 @@ def dict_learning_online(X, n_components=2, alpha=1, n_iter=100,
     else:
         dictionary = np.r_[dictionary,
                            np.zeros((n_components - r, dictionary.shape[1]))]
-    dictionary = np.ascontiguousarray(dictionary.T)
 
     if verbose == 1:
         print('[dict_learning]', end=' ')
@@ -650,6 +660,10 @@ def dict_learning_online(X, n_components=2, alpha=1, n_iter=100,
     else:
         X_train = X
 
+    dictionary = check_array(dictionary.T, order='F', dtype=np.float64,
+                             copy=False)
+    X_train = check_array(X_train, order='C', dtype=np.float64, copy=False)
+
     batches = gen_batches(n_samples, batch_size)
     batches = itertools.cycle(batches)
 
diff --git a/sklearn/linear_model/base.py b/sklearn/linear_model/base.py
index f0563f4747f66..88b5b41efe346 100644
--- a/sklearn/linear_model/base.py
+++ b/sklearn/linear_model/base.py
@@ -15,6 +15,7 @@
 from __future__ import division
 from abc import ABCMeta, abstractmethod
 import numbers
+import warnings
 
 import numpy as np
 import scipy.sparse as sp
@@ -397,7 +398,8 @@ def fit(self, X, y):
         return self
 
 
-def _pre_fit(X, y, Xy, precompute, normalize, fit_intercept, copy):
+def _pre_fit(X, y, Xy, precompute, normalize, fit_intercept, copy,
+             Xy_precompute_order=None):
     """Aux function used at beginning of fit in linear models"""
     n_samples, n_features = X.shape
     if sparse.isspmatrix(X):
@@ -408,10 +410,13 @@ def _pre_fit(X, y, Xy, precompute, normalize, fit_intercept, copy):
         # copy was done in fit if necessary
         X, y, X_mean, y_mean, X_std = center_data(
             X, y, fit_intercept, normalize, copy=copy)
-
-    if hasattr(precompute, '__array__') \
-            and not np.allclose(X_mean, np.zeros(n_features)) \
-            and not np.allclose(X_std, np.ones(n_features)):
+    if hasattr(precompute, '__array__') and (
+            fit_intercept and not np.allclose(X_mean, np.zeros(n_features))
+            or normalize and not np.allclose(X_std, np.ones(n_features))):
+        warnings.warn("Gram matrix was provided but X was centered"
+                      " to fit intercept or X was normalized: "
+                      "recomputing Gram matrix.",
+                      UserWarning)
         # recompute Gram
         precompute = 'auto'
         Xy = None
@@ -422,11 +427,16 @@ def _pre_fit(X, y, Xy, precompute, normalize, fit_intercept, copy):
 
     if precompute is True:
         precompute = np.dot(X.T, X)
+        if Xy_precompute_order == 'F':
+            precompute = precompute.T  # symmetric Gram: Fortran-ordered view
 
     if not hasattr(precompute, '__array__'):
         Xy = None  # cannot use Xy if precompute is not Gram
 
     if hasattr(precompute, '__array__') and Xy is None:
-        Xy = np.dot(X.T, y)
+        if Xy_precompute_order == 'F':
+            Xy = np.dot(y.T, X).T
+        else:
+            Xy = np.dot(X.T, y)
 
     return X, y, X_mean, y_mean, X_std, precompute, Xy
 
diff --git a/sklearn/linear_model/coordinate_descent.py b/sklearn/linear_model/coordinate_descent.py
index f99b20573a364..b6b016605bcf0 100644
--- a/sklearn/linear_model/coordinate_descent.py
+++ b/sklearn/linear_model/coordinate_descent.py
@@ -359,11 +359,18 @@ def enet_path(X, y, l1_ratio=0.5, eps=1e-3, n_alphas=100, alphas=None,
     ElasticNet
     ElasticNetCV
     """
-    X = check_array(X, 'csc', dtype=np.float64, order='F', copy=copy_X)
-    y = check_array(y, 'csc', dtype=np.float64, order='F', copy=False, ensure_2d=False)
-    if Xy is not None:
-        Xy = check_array(Xy, 'csc', dtype=np.float64, order='F', copy=False,
-                         ensure_2d=False)
+    # We expect X and y to be already float64 Fortran ordered when bypassing
+    # checks
+    check_input = 'check_input' not in params or params['check_input']
+    pre_fit = 'pre_fit' not in params or params['pre_fit']
+    if check_input:
+        X = check_array(X, 'csc', dtype=np.float64, order='F', copy=copy_X)
+        y = check_array(y, 'csc', dtype=np.float64, order='F', copy=False,
+                        ensure_2d=False)
+        if Xy is not None:
+            Xy = check_array(Xy, 'csc', dtype=np.float64, order='F',
+                             copy=False,
+                             ensure_2d=False)
     n_samples, n_features = X.shape
 
     multi_output = False
@@ -380,10 +387,13 @@ def enet_path(X, y, l1_ratio=0.5, eps=1e-3, n_alphas=100, alphas=None,
     else:
        X_sparse_scaling = np.zeros(n_features)
 
-    # X should be normalized and fit already.
-    X, y, X_mean, y_mean, X_std, precompute, Xy = \
-        _pre_fit(X, y, Xy, precompute, normalize=False, fit_intercept=False,
-                 copy=False)
+    # X should already be normalized and pre-fit when this function is
+    # called from ElasticNet.fit
+    if pre_fit:
+        X, y, X_mean, y_mean, X_std, precompute, Xy = \
+            _pre_fit(X, y, Xy, precompute, normalize=False,
+                     fit_intercept=False,
+                     copy=False, Xy_precompute_order='F')
     if alphas is None:
         # No need to normalize of fit_intercept: it has been done
         # above
@@ -428,7 +438,11 @@ def enet_path(X, y, l1_ratio=0.5, eps=1e-3, n_alphas=100, alphas=None,
             model = cd_fast.enet_coordinate_descent_multi_task(
                 coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random)
         elif isinstance(precompute, np.ndarray):
-            precompute = check_array(precompute, 'csc', dtype=np.float64, order='F')
+            # We expect precompute to be already Fortran ordered when
+            # bypassing checks
+            if check_input:
+                precompute = check_array(precompute, 'csc', dtype=np.float64,
+                                         order='F')
             model = cd_fast.enet_coordinate_descent_gram(
                 coef_, l1_reg, l2_reg, precompute, Xy, y, max_iter,
                 tol, rng, random, positive)
@@ -601,7 +615,7 @@ def __init__(self, alpha=1.0, l1_ratio=0.5, fit_intercept=True,
         self.random_state = random_state
         self.selection = selection
 
-    def fit(self, X, y):
+    def fit(self, X, y, check_input=True):
         """Fit model with coordinate descent.
 
         Parameters
         ----------
@@ -622,6 +636,7 @@ def fit(self, X, y):
         To avoid memory re-allocation it is advised to allocate the
         initial data in memory directly using that format.
         """
+
         if self.alpha == 0:
             warnings.warn("With alpha=0, this algorithm does not converge "
                           "well. You are advised to use the LinearRegression "
@@ -632,14 +647,16 @@ def fit(self, X, y):
                           "slower even when n_samples > n_features. Hence "
                          "it will be removed in 0.18.",
                          DeprecationWarning, stacklevel=2)
-
-        X, y = check_X_y(X, y, accept_sparse='csc', dtype=np.float64,
-                         order='F', copy=self.copy_X and self.fit_intercept,
-                         multi_output=True, y_numeric=True)
-
+        # We expect X and y to be already float64 Fortran ordered arrays
+        # when bypassing checks
+        if check_input:
+            X, y = check_X_y(X, y, accept_sparse='csc', dtype=np.float64,
+                             order='F',
+                             copy=self.copy_X and self.fit_intercept,
+                             multi_output=True, y_numeric=True)
         X, y, X_mean, y_mean, X_std, precompute, Xy = \
             _pre_fit(X, y, None, self.precompute, self.normalize,
-                     self.fit_intercept, copy=True)
+                     self.fit_intercept, copy=False, Xy_precompute_order='F')
 
         if y.ndim == 1:
             y = y[:, np.newaxis]
@@ -678,7 +695,9 @@ def fit(self, X, y):
                           X_mean=X_mean, X_std=X_std, return_n_iter=True,
                           coef_init=coef_[k], max_iter=self.max_iter,
                           random_state=self.random_state,
-                          selection=self.selection)
+                          selection=self.selection,
+                          check_input=False,
+                          pre_fit=False)
             coef_[k] = this_coef[:, 0]
             dual_gaps_[k] = this_dual_gap[0]
             self.n_iter_.append(this_iter[0])
diff --git a/sklearn/linear_model/tests/test_coordinate_descent.py b/sklearn/linear_model/tests/test_coordinate_descent.py
index da7fed216089c..be50a86b23b90 100644
--- a/sklearn/linear_model/tests/test_coordinate_descent.py
+++ b/sklearn/linear_model/tests/test_coordinate_descent.py
@@ -17,6 +17,7 @@
 from sklearn.utils.testing import assert_greater
 from sklearn.utils.testing import assert_raises
 from sklearn.utils.testing import assert_warns
+from sklearn.utils.testing import assert_warns_message
 from sklearn.utils.testing import ignore_warnings
 from sklearn.utils.testing import assert_array_equal
 from sklearn.utils.testing import TempMemmap
@@ -25,6 +26,7 @@
     LassoCV, ElasticNet, ElasticNetCV, MultiTaskLasso, MultiTaskElasticNet, \
     MultiTaskElasticNetCV, MultiTaskLassoCV, lasso_path, enet_path
 from sklearn.linear_model import LassoLarsCV, lars_path
+from sklearn.utils import check_array
 
 
 def check_warnings():
@@ -628,3 +630,39 @@ def test_sparse_dense_descent_paths():
         _, coefs, _ = path(X, y, fit_intercept=False)
         _, sparse_coefs, _ = path(csr, y, fit_intercept=False)
         assert_array_almost_equal(coefs, sparse_coefs)
+
+
+def test_check_input_false():
+    X, y, _, _ = build_dataset(n_samples=20, n_features=10)
+    X = check_array(X, order='F', dtype='float64')
+    y = check_array(y, order='F', dtype='float64', ensure_2d=False)
+    clf = ElasticNet(selection='cyclic', tol=1e-8)
+    # Check that no error is raised if data is provided in the right format
+    clf.fit(X, y, check_input=False)
+    X = check_array(X, order='F', dtype='float32')
+    clf.fit(X, y, check_input=True)
+    # Check that an error is raised if data is provided in the wrong format,
+    # because of check bypassing
+    assert_raises(ValueError, clf.fit, X, y, check_input=False)
+
+    # With no input checking, providing X in C order should result in
+    # incorrect computation
+    X = check_array(X, order='C', dtype='float64')
+    clf.fit(X, y, check_input=False)
+    coef_false = clf.coef_
+    clf.fit(X, y, check_input=True)
+    coef_true = clf.coef_
+    assert_raises(AssertionError, assert_array_almost_equal,
+                  coef_true, coef_false)
+
+
+def test_overridden_gram_matrix():
+    X, y, _, _ = build_dataset(n_samples=20, n_features=10)
+    Gram = X.T.dot(X)
+    clf = ElasticNet(selection='cyclic', tol=1e-8, precompute=Gram,
+                     fit_intercept=True)
+    assert_warns_message(UserWarning,
+                         "Gram matrix was provided but X was centered"
+                         " to fit intercept or X was normalized: "
+                         "recomputing Gram matrix.",
+                         clf.fit, X, y)
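
Usage note: the check_input=False fast path introduced by this patch is meant
for callers that validate their arrays once and then fit repeatedly, which is
exactly what _sparse_encode does for algorithm='lasso_cd'. Below is a minimal
sketch of that pattern, assuming the patch is applied; the random data, the
alpha value, and the batch split are illustrative only, and the parameter
names follow the 2015-era scikit-learn API (e.g. normalize):

    import numpy as np
    from sklearn.linear_model import Lasso
    from sklearn.utils import check_array

    rng = np.random.RandomState(0)
    D = rng.randn(8, 100)    # dictionary: n_components x n_features
    X = rng.randn(50, 100)   # signals to encode: n_samples x n_features

    # Pay the validation/conversion cost once, outside the hot loop:
    # float64 and Fortran ordering are what the bypassed checks would
    # otherwise enforce on every call.
    D_T = check_array(D.T, order='F', dtype=np.float64)
    gram = np.dot(D, D.T).T  # transposed product gives a Fortran-ordered Gram

    clf = Lasso(alpha=0.1, fit_intercept=False, normalize=False,
                precompute=gram, warm_start=True)
    for batch in np.array_split(X, 5):  # equal-sized minibatches
        y = np.asfortranarray(batch.T, dtype=np.float64)
        clf.fit(D_T, y, check_input=False)  # skips check_X_y entirely
        code = clf.coef_  # sparse codes for this batch: (batch_size, 8)

With checks enabled, every fit in the loop would re-run check_X_y and copy
the data; bypassing them is what produces the per-iteration speedup quoted
in the whats_new entry.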