From 5f95cbd651ebbe7f2ef9c1ed63d1008539e144a5 Mon Sep 17 00:00:00 2001 From: Vighnesh Birodkar Date: Thu, 10 Dec 2015 10:49:32 -0500 Subject: [PATCH 1/3] make sparse encoder map copy on write arrays as gram omp modifies in place --- sklearn/decomposition/dict_learning.py | 2 +- sklearn/decomposition/tests/test_dict_learning.py | 11 +++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/sklearn/decomposition/dict_learning.py b/sklearn/decomposition/dict_learning.py index 990baef4c8c78..b1ae26d88d4be 100644 --- a/sklearn/decomposition/dict_learning.py +++ b/sklearn/decomposition/dict_learning.py @@ -288,7 +288,7 @@ def sparse_encode(X, dictionary, gram=None, cov=None, algorithm='lasso_lars', code = np.empty((n_samples, n_components)) slices = list(gen_even_slices(n_samples, _get_n_jobs(n_jobs))) - code_views = Parallel(n_jobs=n_jobs, verbose=verbose)( + code_views = Parallel(n_jobs=n_jobs, verbose=verbose, mmap_mode='c')( delayed(_sparse_encode)( X[this_slice], dictionary, gram, cov[:, this_slice] if cov is not None else None, diff --git a/sklearn/decomposition/tests/test_dict_learning.py b/sklearn/decomposition/tests/test_dict_learning.py index 65e3fc99d1742..4b1c44b555a2e 100644 --- a/sklearn/decomposition/tests/test_dict_learning.py +++ b/sklearn/decomposition/tests/test_dict_learning.py @@ -238,3 +238,14 @@ def test_sparse_coder_estimator(): transform_alpha=0.001).transform(X) assert_true(not np.all(code == 0)) assert_less(np.sqrt(np.sum((np.dot(code, V) - X) ** 2)), 0.1) + + +def test_sparse_coder_mmap(): + # Test that SparseCoder does not error by passing read-only + # arrays to child processes + + rng = np.random.RandomState(777) + init_dict = rng.rand(500, 64) + data = np.random.rand(8096, 64) + sc = SparseCoder(init_dict, transform_algorithm='omp', n_jobs=2) + sc.fit_transform(data) From 8c26c02adc634207c17245bfb9430f617db0c391 Mon Sep 17 00:00:00 2001 From: Vighnesh Birodkar Date: Tue, 19 Apr 2016 18:03:57 -0400 Subject: [PATCH 2/3] 
Added comments to the unit test --- sklearn/decomposition/tests/test_dict_learning.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/sklearn/decomposition/tests/test_dict_learning.py b/sklearn/decomposition/tests/test_dict_learning.py index 4b1c44b555a2e..0bbb76bd52789 100644 --- a/sklearn/decomposition/tests/test_dict_learning.py +++ b/sklearn/decomposition/tests/test_dict_learning.py @@ -245,7 +245,12 @@ def test_sparse_coder_mmap(): # arrays to child processes rng = np.random.RandomState(777) - init_dict = rng.rand(500, 64) - data = np.random.rand(8096, 64) + num_cols = 64 + init_dict = rng.rand(500, num_cols) + # Ensure that `data` is >2M. Joblib memory maps arrays + # if they are larger than 1MB. The 4 accounts for float32 + # data type + num_rows = (1024*1024*2)/(4*num_cols) + data = np.random.rand(num_rows, num_cols).astype(np.float32) sc = SparseCoder(init_dict, transform_algorithm='omp', n_jobs=2) sc.fit_transform(data) From b3840bb1a01829a9d3a81a425922bcc86d9a66af Mon Sep 17 00:00:00 2001 From: Vighnesh Birodkar Date: Wed, 27 Apr 2016 19:12:31 -0400 Subject: [PATCH 3/3] Made check_copy_writeable public and used it in _sparse_encode --- sklearn/decomposition/dict_learning.py | 7 +++++-- sklearn/linear_model/least_angle.py | 16 ++++++---------- sklearn/utils/__init__.py | 4 ++-- sklearn/utils/tests/test_validation.py | 9 +++++++++ sklearn/utils/validation.py | 16 ++++++++++++++++ 5 files changed, 38 insertions(+), 14 deletions(-) diff --git a/sklearn/decomposition/dict_learning.py b/sklearn/decomposition/dict_learning.py index b1ae26d88d4be..2f68191da58e7 100644 --- a/sklearn/decomposition/dict_learning.py +++ b/sklearn/decomposition/dict_learning.py @@ -18,7 +18,7 @@ from ..externals.joblib import Parallel, delayed, cpu_count from ..externals.six.moves import zip from ..utils import (check_array, check_random_state, gen_even_slices, - gen_batches, _get_n_jobs) + gen_batches, _get_n_jobs, check_copy_and_writeable) from 
..utils.extmath import randomized_svd, row_norms from ..utils.validation import check_is_fitted from ..linear_model import Lasso, orthogonal_mp_gram, LassoLars, Lars @@ -91,6 +91,9 @@ def _sparse_encode(X, dictionary, gram, cov=None, algorithm='lasso_lars', sklearn.linear_model.Lasso SparseCoder """ + if cov is not None: + cov = check_copy_and_writeable(cov, copy_cov) + if X.ndim == 1: X = X[:, np.newaxis] n_samples, n_features = X.shape @@ -288,7 +291,7 @@ def sparse_encode(X, dictionary, gram=None, cov=None, algorithm='lasso_lars', code = np.empty((n_samples, n_components)) slices = list(gen_even_slices(n_samples, _get_n_jobs(n_jobs))) - code_views = Parallel(n_jobs=n_jobs, verbose=verbose, mmap_mode='c')( + code_views = Parallel(n_jobs=n_jobs, verbose=verbose)( delayed(_sparse_encode)( X[this_slice], dictionary, gram, cov[:, this_slice] if cov is not None else None, diff --git a/sklearn/linear_model/least_angle.py b/sklearn/linear_model/least_angle.py index 9fce600950c9e..e1936c351d0b2 100644 --- a/sklearn/linear_model/least_angle.py +++ b/sklearn/linear_model/least_angle.py @@ -21,7 +21,8 @@ from .base import LinearModel from ..base import RegressorMixin -from ..utils import arrayfuncs, as_float_array, check_X_y +from ..utils import (arrayfuncs, as_float_array, check_X_y, + check_copy_and_writeable) from ..model_selection import check_cv from ..exceptions import ConvergenceWarning from ..externals.joblib import Parallel, delayed @@ -847,11 +848,6 @@ def __init__(self, alpha=1.0, fit_intercept=True, verbose=False, ############################################################################### # Cross-validated estimator classes -def _check_copy_and_writeable(array, copy=False): - if copy or not array.flags.writeable: - return array.copy() - return array - def _lars_path_residues(X_train, y_train, X_test, y_test, Gram=None, copy=True, method='lars', verbose=False, @@ -938,10 +934,10 @@ def _lars_path_residues(X_train, y_train, X_test, y_test, Gram=None, residues 
: array, shape (n_alphas, n_samples) Residues of the prediction on the test data """ - X_train = _check_copy_and_writeable(X_train, copy) - y_train = _check_copy_and_writeable(y_train, copy) - X_test = _check_copy_and_writeable(X_test, copy) - y_test = _check_copy_and_writeable(y_test, copy) + X_train = check_copy_and_writeable(X_train, copy) + y_train = check_copy_and_writeable(y_train, copy) + X_test = check_copy_and_writeable(X_test, copy) + y_test = check_copy_and_writeable(y_test, copy) if fit_intercept: X_mean = X_train.mean(axis=0) diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index 4d62e4f394180..b6bb07c593c0b 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -12,7 +12,7 @@ assert_all_finite, check_random_state, column_or_1d, check_array, check_consistent_length, check_X_y, indexable, - check_symmetric) + check_symmetric, check_copy_and_writeable) from .deprecation import deprecated from .class_weight import compute_class_weight, compute_sample_weight from ..externals.joblib import cpu_count @@ -32,7 +32,7 @@ class ConvergenceWarning(_ConvergenceWarning): "compute_class_weight", "compute_sample_weight", "column_or_1d", "safe_indexing", "check_consistent_length", "check_X_y", 'indexable', - "check_symmetric"] + "check_symmetric", "check_copy_and_writeable"] def safe_mask(X, mask): diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index d577864fb709a..c386f6bb36f5b 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -29,6 +29,7 @@ has_fit_parameter, check_is_fitted, check_consistent_length, + check_copy_and_writeable ) from sklearn.exceptions import NotFittedError @@ -459,3 +460,11 @@ def test_check_consistent_length(): assert_raises_regexp(TypeError, 'estimator', check_consistent_length, [1, 2], RandomForestRegressor()) # XXX: We should have a test with a string, but what is correct behaviour? 
+ + +def test_check_copy_and_writeable(): + X = np.zeros((10, 10)) + assert check_copy_and_writeable(X).flags.writeable + + X.flags.writeable = False + assert check_copy_and_writeable(X).flags.writeable diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 26bc3a8479f06..efdda70f0fbf6 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -700,3 +700,19 @@ def check_non_negative(X, whom): X = X.data if sp.issparse(X) else X if (X < 0).any(): raise ValueError("Negative values in data passed to %s" % whom) + + +def check_copy_and_writeable(X, copy=False): + """Return a writeable array, copying `X` only when required + + Parameters + ---------- + X : array + Input array. + copy : bool + If set, a copy of `X` is made irrespective of it being writeable + or not. + """ + if copy or not X.flags.writeable: + return X.copy() + return X