From ce065bad325aa4cf322f618af83e0d4071087ea7 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Sun, 17 Jan 2021 17:31:24 +0100 Subject: [PATCH 1/9] several fixes to dict update in dict learning --- sklearn/decomposition/_dict_learning.py | 133 ++++++++++-------------- 1 file changed, 57 insertions(+), 76 deletions(-) diff --git a/sklearn/decomposition/_dict_learning.py b/sklearn/decomposition/_dict_learning.py index 046738aa9700d..e323d9a5f2c84 100644 --- a/sklearn/decomposition/_dict_learning.py +++ b/sklearn/decomposition/_dict_learning.py @@ -354,28 +354,30 @@ def sparse_encode(X, dictionary, *, gram=None, cov=None, return code -def _update_dict(dictionary, Y, code, verbose=False, return_r2=False, - random_state=None, positive=False): +def _update_dict(dictionary, Y, code, A=None, B=None, verbose=False, + return_r2=False, random_state=None, positive=False): """Update the dense dictionary factor in place. Parameters ---------- - dictionary : ndarray of shape (n_features, n_components) + dictionary : ndarray of shape (n_components, n_features) Value of the dictionary at the previous iteration. - Y : ndarray of shape (n_features, n_samples) + Y : ndarray of shape (n_samples, n_features) Data matrix. - code : ndarray of shape (n_components, n_samples) + code : ndarray of shape (n_samples, n_components) Sparse coding of the data against which to optimize the dictionary. + A : ndarray of shape (n_components, n_components), default=None + With B, sufficient stats of the online model. + + B : ndarray of shape (n_features, n_components), default=None + With A, sufficient stats of the online model. + verbose: bool, default=False Degree of output the procedure will print. - return_r2 : bool, default=False - Whether to compute and return the residual sum of squares corresponding - to the computed solution. - random_state : int, RandomState instance or None, default=None Used for randomly initializing the dictionary. Pass an int for reproducible results across multiple function calls. @@ -385,54 +387,37 @@ def _update_dict(dictionary, Y, code, verbose=False, return_r2=False, Whether to enforce positivity when finding the dictionary. .. versionadded:: 0.20 - - Returns - ------- - dictionary : ndarray of shape (n_features, n_components) - Updated dictionary. 
""" - n_components = len(code) - n_features = Y.shape[0] + n_samples, n_components = code.shape random_state = check_random_state(random_state) - # Get BLAS functions - gemm, = linalg.get_blas_funcs(('gemm',), (dictionary, code, Y)) - ger, = linalg.get_blas_funcs(('ger',), (dictionary, code)) - nrm2, = linalg.get_blas_funcs(('nrm2',), (dictionary,)) - # Residuals, computed with BLAS for speed and efficiency - # R <- -1.0 * U * V^T + 1.0 * Y - # Outputs R as Fortran array for efficiency - R = gemm(-1.0, dictionary, code, 1.0, Y) + + if A is None: + A = np.dot(code.T, code) + if B is None: + B = np.dot(Y.T, code) + for k in range(n_components): - # R <- 1.0 * U_k * V_k^T + R - R = ger(1.0, dictionary[:, k], code[k, :], a=R, overwrite_a=True) - dictionary[:, k] = np.dot(R, code[k, :]) - if positive: - np.clip(dictionary[:, k], 0, None, out=dictionary[:, k]) - # Scale k'th atom - # (U_k * U_k) ** 0.5 - atom_norm = nrm2(dictionary[:, k]) - if atom_norm < 1e-10: + if A[k, k] > 1e-6: + dictionary[k] += (B[:, k] - A[k].dot(dictionary)) / A[k, k] + else: + # kth atom is almost never used -> sample a new one from the data if verbose == 1: sys.stdout.write("+") sys.stdout.flush() elif verbose: print("Adding new random atom") - dictionary[:, k] = random_state.randn(n_features) - if positive: - np.clip(dictionary[:, k], 0, None, out=dictionary[:, k]) - # Setting corresponding coefs to 0 - code[k, :] = 0.0 - # (U_k * U_k) ** 0.5 - atom_norm = nrm2(dictionary[:, k]) - dictionary[:, k] /= atom_norm - else: - dictionary[:, k] /= atom_norm - # R <- -1.0 * U_k * V_k^T + R - R = ger(-1.0, dictionary[:, k], code[k, :], a=R, overwrite_a=True) - if return_r2: - R = nrm2(R) ** 2.0 - return dictionary, R - return dictionary + + newd = Y[random_state.choice(n_samples)] + # add small noise to avoid making the sparse coding ill conditioned + noise = random_state.normal(0, 0.01 * newd.std(), size=len(newd)) + dictionary[k] = newd + noise + code[:, k] = 0 + + if positive: + np.clip(dictionary[k], 0, None, out=dictionary[k]) + + # Projection on the constraint ||V_k|| == 1 + dictionary[k] /= linalg.norm(dictionary[k]) @_deprecate_positional_args @@ -574,11 +559,10 @@ def dict_learning(X, n_components, *, alpha, max_iter=100, tol=1e-8, dictionary = np.r_[dictionary, np.zeros((n_components - r, dictionary.shape[1]))] - # Fortran-order dict, as we are going to access its row vectors + # Fortran-order dict better suited for the sparse coding which is the + # bottleneck of this algorithm. 
dictionary = np.array(dictionary, order='F') - residuals = 0 - errors = [] current_cost = np.nan @@ -602,15 +586,14 @@ def dict_learning(X, n_components, *, alpha, max_iter=100, tol=1e-8, code = sparse_encode(X, dictionary, algorithm=method, alpha=alpha, init=code, n_jobs=n_jobs, positive=positive_code, max_iter=method_max_iter, verbose=verbose) - # Update dictionary - dictionary, residuals = _update_dict(dictionary.T, X.T, code.T, - verbose=verbose, return_r2=True, - random_state=random_state, - positive=positive_dict) - dictionary = dictionary.T + + # Update dictionary in place + _update_dict(dictionary, X, code, verbose=verbose, + random_state=random_state, positive=positive_dict) # Cost function - current_cost = 0.5 * residuals + alpha * np.sum(np.abs(code)) + current_cost = (0.5 * np.sum((X - code @ dictionary)**2) + + alpha * np.sum(np.abs(code))) errors.append(current_cost) if ii > 0: @@ -802,7 +785,7 @@ def dict_learning_online(X, n_components=2, *, alpha=1, n_iter=100, else: X_train = X - dictionary = check_array(dictionary.T, order='F', dtype=np.float64, + dictionary = check_array(dictionary, order='F', dtype=np.float64, copy=False) dictionary = np.require(dictionary, requirements='W') @@ -834,11 +817,11 @@ def dict_learning_online(X, n_components=2, *, alpha=1, n_iter=100, print("Iteration % 3i (elapsed time: % 3is, % 4.1fmn)" % (ii, dt, dt / 60)) - this_code = sparse_encode(this_X, dictionary.T, algorithm=method, + this_code = sparse_encode(this_X, dictionary, algorithm=method, alpha=alpha, n_jobs=n_jobs, check_input=False, positive=positive_code, - max_iter=method_max_iter, verbose=verbose).T + max_iter=method_max_iter, verbose=verbose) # Update the auxiliary variables if ii < batch_size - 1: @@ -848,15 +831,13 @@ def dict_learning_online(X, n_components=2, *, alpha=1, n_iter=100, beta = (theta + 1 - batch_size) / (theta + 1) A *= beta - A += np.dot(this_code, this_code.T) + A += np.dot(this_code.T, this_code) B *= beta - B += np.dot(this_X.T, this_code.T) + B += np.dot(this_X.T, this_code) - # Update dictionary - dictionary = _update_dict(dictionary, B, A, verbose=verbose, - random_state=random_state, - positive=positive_dict) - # XXX: Can the residuals be of any use? 
+ # Update dictionary in place + _update_dict(dictionary, this_X, this_code, A, B, verbose=verbose, + random_state=random_state, positive=positive_dict) # Maybe we need a stopping criteria based on the amount of # modification in the dictionary @@ -865,15 +846,15 @@ def dict_learning_online(X, n_components=2, *, alpha=1, n_iter=100, if return_inner_stats: if return_n_iter: - return dictionary.T, (A, B), ii - iter_offset + 1 + return dictionary, (A, B), ii - iter_offset + 1 else: - return dictionary.T, (A, B) + return dictionary, (A, B) if return_code: if verbose > 1: print('Learning code...', end=' ') elif verbose == 1: print('|', end=' ') - code = sparse_encode(X, dictionary.T, algorithm=method, alpha=alpha, + code = sparse_encode(X, dictionary, algorithm=method, alpha=alpha, n_jobs=n_jobs, check_input=False, positive=positive_code, max_iter=method_max_iter, verbose=verbose) @@ -881,14 +862,14 @@ def dict_learning_online(X, n_components=2, *, alpha=1, n_iter=100, dt = (time.time() - t0) print('done (total time: % 3is, % 4.1fmn)' % (dt, dt / 60)) if return_n_iter: - return code, dictionary.T, ii - iter_offset + 1 + return code, dictionary, ii - iter_offset + 1 else: - return code, dictionary.T + return code, dictionary if return_n_iter: - return dictionary.T, ii - iter_offset + 1 + return dictionary, ii - iter_offset + 1 else: - return dictionary.T + return dictionary class _BaseSparseCoding(TransformerMixin): From d78859bdb522340c4941ca8f2d49f0ddba7affeb Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Mon, 18 Jan 2021 18:09:02 +0100 Subject: [PATCH 2/9] fix docstrings --- sklearn/decomposition/_dict_learning.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/decomposition/_dict_learning.py b/sklearn/decomposition/_dict_learning.py index e323d9a5f2c84..395f3295863c7 100644 --- a/sklearn/decomposition/_dict_learning.py +++ b/sklearn/decomposition/_dict_learning.py @@ -1250,7 +1250,7 @@ class DictionaryLearning(_BaseSparseCoding, BaseEstimator): We can check the level of sparsity of `X_transformed`: >>> np.mean(X_transformed == 0) - 0.88... + 0.87... We can compare the average squared euclidean norm of the reconstruction error of the sparse coded signal relative to the squared euclidean norm of @@ -1258,7 +1258,7 @@ class DictionaryLearning(_BaseSparseCoding, BaseEstimator): >>> X_hat = X_transformed @ dict_learner.components_ >>> np.mean(np.sum((X_hat - X) ** 2, axis=1) / np.sum(X ** 2, axis=1)) - 0.07... + 0.08... Notes ----- @@ -1490,7 +1490,7 @@ class MiniBatchDictionaryLearning(_BaseSparseCoding, BaseEstimator): We can check the level of sparsity of `X_transformed`: >>> np.mean(X_transformed == 0) - 0.87... + 0.86... We can compare the average squared euclidean norm of the reconstruction error of the sparse coded signal relative to the squared euclidean norm of @@ -1498,7 +1498,7 @@ class MiniBatchDictionaryLearning(_BaseSparseCoding, BaseEstimator): >>> X_hat = X_transformed @ dict_learner.components_ >>> np.mean(np.sum((X_hat - X) ** 2, axis=1) / np.sum(X ** 2, axis=1)) - 0.10... + 0.07... 
Notes ----- From 66553aab86f3695c3dafb362ce1cc8a57fcc5064 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Mon, 18 Jan 2021 18:31:47 +0100 Subject: [PATCH 3/9] avoid noise with 0 std --- sklearn/decomposition/_dict_learning.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sklearn/decomposition/_dict_learning.py b/sklearn/decomposition/_dict_learning.py index 395f3295863c7..7225b122103b5 100644 --- a/sklearn/decomposition/_dict_learning.py +++ b/sklearn/decomposition/_dict_learning.py @@ -408,8 +408,11 @@ def _update_dict(dictionary, Y, code, A=None, B=None, verbose=False, print("Adding new random atom") newd = Y[random_state.choice(n_samples)] + # add small noise to avoid making the sparse coding ill conditioned - noise = random_state.normal(0, 0.01 * newd.std(), size=len(newd)) + noise_level = 0.01 * (newd.std() or 1) # avoid 0 std + noise = random_state.normal(0, noise_level, size=len(newd)) + dictionary[k] = newd + noise code[:, k] = 0 From 2885c21158abadb0dd9fb102a511086898d8e524 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Tue, 19 Jan 2021 14:36:14 +0100 Subject: [PATCH 4/9] add test for dict update --- .../decomposition/tests/test_dict_learning.py | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/sklearn/decomposition/tests/test_dict_learning.py b/sklearn/decomposition/tests/test_dict_learning.py index c9590f3136678..31139dec9093a 100644 --- a/sklearn/decomposition/tests/test_dict_learning.py +++ b/sklearn/decomposition/tests/test_dict_learning.py @@ -10,6 +10,7 @@ from sklearn.utils import check_array +from sklearn.utils._testing import assert_allclose from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import ignore_warnings @@ -25,6 +26,8 @@ from sklearn.utils.estimator_checks import check_transformer_general from sklearn.utils.estimator_checks import check_transformers_unfitted +from sklearn.decomposition._dict_learning import _update_dict + rng_global = np.random.RandomState(0) n_samples, n_features = 10, 8 @@ -573,3 +576,32 @@ def test_sparse_coder_n_features_in(): d = np.array([[1, 2, 3], [1, 2, 3]]) sc = SparseCoder(d) assert sc.n_features_in_ == d.shape[1] + + +def test_update_dict(): + # Check that the dict update is correct in batch and online mode + code = np.array([[0.5, -0.5], + [0.1, 0.9]]) + dictionary = np.array([[1., 0.], + [0.6, 0.8]]) + + # Create X s.t. the dictionary is already optimal + X = np.dot(code, dictionary) + + # Since ||d_k|| = 1, we expect the update to not change the dictionary. 
+ expected = dictionary.copy() + + # batch mode + new_dict = dictionary.copy() + _update_dict(new_dict, X, code) + + assert_allclose(new_dict, expected) + + # online mode + A = np.dot(code.T, code) + B = np.dot(X.T, code) + + new_dict = dictionary.copy() + _update_dict(new_dict, X, code, A, B) + + assert_allclose(new_dict, expected) From bd30598e6aba1a8c8611678181603e41555bfb89 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Tue, 19 Jan 2021 16:53:08 +0100 Subject: [PATCH 5/9] better test --- .../decomposition/tests/test_dict_learning.py | 26 ++++++++----------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/sklearn/decomposition/tests/test_dict_learning.py b/sklearn/decomposition/tests/test_dict_learning.py index 31139dec9093a..14e1524f00802 100644 --- a/sklearn/decomposition/tests/test_dict_learning.py +++ b/sklearn/decomposition/tests/test_dict_learning.py @@ -579,29 +579,25 @@ def test_sparse_coder_n_features_in(): def test_update_dict(): - # Check that the dict update is correct in batch and online mode + # Check the dict update in batch mode vs online mode + rng = np.random.RandomState(0) + code = np.array([[0.5, -0.5], [0.1, 0.9]]) dictionary = np.array([[1., 0.], [0.6, 0.8]]) - # Create X s.t. the dictionary is already optimal - X = np.dot(code, dictionary) - - # Since ||d_k|| = 1, we expect the update to not change the dictionary. - expected = dictionary.copy() - - # batch mode - new_dict = dictionary.copy() - _update_dict(new_dict, X, code) + X = np.dot(code, dictionary) + rng.randn(2, 2) - assert_allclose(new_dict, expected) + # batch update + newd_batch = dictionary.copy() + _update_dict(newd_batch, X, code) - # online mode + # online update A = np.dot(code.T, code) B = np.dot(X.T, code) - new_dict = dictionary.copy() - _update_dict(new_dict, X, code, A, B) + newd_online = dictionary.copy() + _update_dict(newd_online, X, code, A, B) - assert_allclose(new_dict, expected) + assert_allclose(newd_batch, newd_online) From 7cb5e6894efce05411238ddf4d39c6cb92e5448a Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Tue, 19 Jan 2021 18:01:19 +0100 Subject: [PATCH 6/9] fix constraint on the atom norms for dict learning --- doc/modules/decomposition.rst | 4 ++-- sklearn/decomposition/_dict_learning.py | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/doc/modules/decomposition.rst b/doc/modules/decomposition.rst index 7e8e79d9d8bdd..12632d9e8845c 100644 --- a/doc/modules/decomposition.rst +++ b/doc/modules/decomposition.rst @@ -247,7 +247,7 @@ problem solved is a PCA problem (dictionary learning) with an .. math:: (U^*, V^*) = \underset{U, V}{\operatorname{arg\,min\,}} & \frac{1}{2} ||X-UV||_2^2+\alpha||V||_1 \\ - \text{subject to } & ||U_k||_2 = 1 \text{ for all } + \text{subject to } & ||U_k||_2 <= 1 \text{ for all } 0 \leq k < n_{components} @@ -433,7 +433,7 @@ dictionary fixed, and then updating the dictionary to best fit the sparse code. .. 
math::
   (U^*, V^*) = \underset{U, V}{\operatorname{arg\,min\,}} & \frac{1}{2}
                ||X-UV||_2^2+\alpha||U||_1 \\
-                \text{subject to } & ||V_k||_2 = 1 \text{ for all }
+                \text{subject to } & ||V_k||_2 <= 1 \text{ for all }
                 0 \leq k < n_{\mathrm{atoms}}
 
diff --git a/sklearn/decomposition/_dict_learning.py b/sklearn/decomposition/_dict_learning.py
index 7225b122103b5..eb4d5b7635da1 100644
--- a/sklearn/decomposition/_dict_learning.py
+++ b/sklearn/decomposition/_dict_learning.py
@@ -419,8 +419,8 @@ def _update_dict(dictionary, Y, code, A=None, B=None, verbose=False,
         if positive:
             np.clip(dictionary[k], 0, None, out=dictionary[k])
 
-        # Projection on the constraint ||V_k|| == 1
-        dictionary[k] /= linalg.norm(dictionary[k])
+        # Projection on the constraint ||V_k|| <= 1
+        dictionary[k] /= max(linalg.norm(dictionary[k]), 1)
 
 
 @_deprecate_positional_args
@@ -1120,7 +1120,7 @@ class DictionaryLearning(_BaseSparseCoding, BaseEstimator):
         (U^*,V^*) = argmin 0.5 || X - U V ||_2^2 + alpha * || U ||_1
                     (U,V)
-                    with || V_k ||_2 = 1 for all 0 <= k < n_components
+                    with || V_k ||_2 <= 1 for all 0 <= k < n_components
 
     Read more in the :ref:`User Guide `.
 
@@ -1352,7 +1352,7 @@ class MiniBatchDictionaryLearning(_BaseSparseCoding, BaseEstimator):
         (U^*,V^*) = argmin 0.5 || X - U V ||_2^2 + alpha * || U ||_1
                     (U,V)
-                    with || V_k ||_2 = 1 for all 0 <= k < n_components
+                    with || V_k ||_2 <= 1 for all 0 <= k < n_components
 
     Read more in the :ref:`User Guide `.
 

From 917e5757e48a2aec57c7c9c856a0f3c3c479b2b6 Mon Sep 17 00:00:00 2001
From: jeremie du boisberranger
Date: Thu, 15 Apr 2021 14:41:44 +0200
Subject: [PATCH 7/9] cln

---
 sklearn/decomposition/_dict_learning.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sklearn/decomposition/_dict_learning.py b/sklearn/decomposition/_dict_learning.py
index 4a97b24af5a7f..6eca2f8d6e661 100644
--- a/sklearn/decomposition/_dict_learning.py
+++ b/sklearn/decomposition/_dict_learning.py
@@ -420,8 +420,8 @@ def _update_dict(dictionary, Y, code, A=None, B=None, verbose=False,
         if positive:
             np.clip(dictionary[k], 0, None, out=dictionary[k])
 
-        # Projection on the constraint set ||V_k|| == 1
-        dictionary[k] /= linalg.norm(dictionary[k])
+        # Projection on the constraint set ||V_k|| <= 1
+        dictionary[k] /= max(linalg.norm(dictionary[k]), 1)
 
     if verbose and n_unused > 0:
         print(f"{n_unused} unused atoms resampled.")

From feea61761299ec06b1191e64cf9d80a11df1a6fc Mon Sep 17 00:00:00 2001
From: jeremie du boisberranger
Date: Wed, 27 Oct 2021 15:07:11 +0200
Subject: [PATCH 8/9] what's new

---
 doc/whats_new/v1.1.rst | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst
index 261b6f8eac6bf..fa4690d7b0f22 100644
--- a/doc/whats_new/v1.1.rst
+++ b/doc/whats_new/v1.1.rst
@@ -74,6 +74,12 @@ Changelog
   and :class:`~sklearn.decomposition.TruncatedSVD`.
   :pr:`21334` by `Thomas Fan`_.
 
+- |Fix| Fixed the constraint on the objective function of
+  :class:`decomposition.DictionaryLearning`,
+  :class:`decomposition.MiniBatchDictionaryLearning`, :class:`decomposition.SparsePCA`
+  and :class:`decomposition.MiniBatchSparsePCA` to be convex and match the referenced
+  article. :pr:`19210` by :user:`Jérémie du Boisberranger <jeremiedbb>`.
+
 :mod:`sklearn.impute`
 .....................
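The dictionary update that this series converges on (patches 1, 3, 6 and 7) is the block coordinate descent step of Mairal et al. (2009), "Online Learning for Matrix Factorization and Sparse Coding", followed by a projection onto the unit ball rather than the unit sphere. For reference, here is a minimal NumPy sketch of that step; `update_dict_sketch` is an illustrative name, not scikit-learn API, and it omits the `positive` handling, the verbosity output, and the small resampling noise added in patch 3:

    import numpy as np

    def update_dict_sketch(D, X, code, rng, A=None, B=None):
        # D    : ndarray of shape (n_components, n_features), updated in place
        # X    : ndarray of shape (n_samples, n_features)
        # code : ndarray of shape (n_samples, n_components)
        # A, B : optional precomputed sufficient statistics (online mode)
        n_samples, n_components = code.shape
        if A is None:
            A = code.T @ code   # (n_components, n_components)
        if B is None:
            B = X.T @ code      # (n_features, n_components)
        for k in range(n_components):
            if A[k, k] > 1e-6:
                # exact minimization of the quadratic objective in atom k,
                # all other atoms held fixed
                D[k] += (B[:, k] - A[k] @ D) / A[k, k]
            else:
                # atom k is essentially unused: resample it from the data
                D[k] = X[rng.choice(n_samples)]
                code[:, k] = 0
            # project onto the constraint set ||d_k||_2 <= 1
            D[k] /= max(np.linalg.norm(D[k]), 1)
        return D

Run on the fixture from the new `test_update_dict` above (with `rng = np.random.RandomState(0)`), this should reproduce `newd_batch`: both atoms are used there, so the resampling branch is never taken.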
From 7ac2260c87fe976d15a61a307e5ff17bffe61998 Mon Sep 17 00:00:00 2001
From: jeremie du boisberranger
Date: Wed, 27 Oct 2021 16:31:32 +0200
Subject: [PATCH 9/9] change version

---
 doc/whats_new/v1.0.rst | 20 ++++++++++++++++++++
 doc/whats_new/v1.1.rst |  6 ------
 2 files changed, 20 insertions(+), 6 deletions(-)

diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst
index 17c2b845d40d9..1cc84b74198ce 100644
--- a/doc/whats_new/v1.0.rst
+++ b/doc/whats_new/v1.0.rst
@@ -2,6 +2,26 @@
 
 .. currentmodule:: sklearn
 
+.. _changes_1_0_2:
+
+Version 1.0.2
+=============
+
+**In Development**
+
+Changelog
+---------
+
+:mod:`sklearn.decomposition`
+............................
+
+- |Fix| Fixed the constraint on the objective function of
+  :class:`decomposition.DictionaryLearning`,
+  :class:`decomposition.MiniBatchDictionaryLearning`, :class:`decomposition.SparsePCA`
+  and :class:`decomposition.MiniBatchSparsePCA` to be convex and match the referenced
+  article. :pr:`19210` by :user:`Jérémie du Boisberranger <jeremiedbb>`.
+
+
 .. _changes_1_0_1:
 
 Version 1.0.1
diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst
index fa4690d7b0f22..261b6f8eac6bf 100644
--- a/doc/whats_new/v1.1.rst
+++ b/doc/whats_new/v1.1.rst
@@ -74,12 +74,6 @@ Changelog
   and :class:`~sklearn.decomposition.TruncatedSVD`.
   :pr:`21334` by `Thomas Fan`_.
 
-- |Fix| Fixed the constraint on the objective function of
-  :class:`decomposition.DictionaryLearning`,
-  :class:`decomposition.MiniBatchDictionaryLearning`, :class:`decomposition.SparsePCA`
-  and :class:`decomposition.MiniBatchSparsePCA` to be convex and match the referenced
-  article. :pr:`19210` by :user:`Jérémie du Boisberranger <jeremiedbb>`.
-
 :mod:`sklearn.impute`
 .....................
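As a closing illustration of what relaxing ||V_k||_2 = 1 to ||V_k||_2 <= 1 changes in practice, here is a hypothetical smoke test, assuming a scikit-learn build that includes this series. The projection now maps atoms onto the unit ball instead of the unit sphere, so fitted atoms may end up with norm strictly below 1, whereas the previous update rescaled every atom to norm exactly 1:

    import numpy as np
    from sklearn.decomposition import DictionaryLearning

    rng = np.random.RandomState(42)
    X = rng.randn(100, 8)

    dl = DictionaryLearning(n_components=5, alpha=1.0, random_state=42).fit(X)
    norms = np.linalg.norm(dl.components_, axis=1)

    # Atom norms are bounded by 1 but no longer forced to equal 1.
    assert np.all(norms <= 1 + 1e-12)
    print(norms.round(3))

The exact norms printed depend on the data and the solver; the only guarantee after this fix is the upper bound.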