From 5119d6898276f8a641517f9667a7b42d5e0ff7f0 Mon Sep 17 00:00:00 2001 From: Manoj-Kumar-S Date: Thu, 24 Apr 2014 17:19:09 +0530 Subject: [PATCH 1/4] ENH: Swap rows in sparsefuncs --- sklearn/utils/sparsefuncs.py | 96 +++++++++++++++++++++++++ sklearn/utils/tests/test_sparsefuncs.py | 31 +++++++- 2 files changed, 126 insertions(+), 1 deletion(-) diff --git a/sklearn/utils/sparsefuncs.py b/sklearn/utils/sparsefuncs.py index f496c1279bae3..a5e3103a2281c 100644 --- a/sklearn/utils/sparsefuncs.py +++ b/sklearn/utils/sparsefuncs.py @@ -56,3 +56,99 @@ def inplace_column_scale(X, scale): else: raise TypeError( "Unsupported type; expected a CSR or CSC sparse matrix.") + + +def swap_row_csc(X, m, n): + """ + Swaps two rows of a CSC matrix in-place. + + Parameters + ---------- + X : scipy.sparse.csc_matrix, shape=(n_samples, n_features) + m : int, index of first sample + m : int, index of second sample + """ + if m < 0: + m += X.shape[0] + if n < 0: + n += X.shape[0] + + m_mask = X.indices == m + X.indices[X.indices == n] = m + X.indices[m_mask] = n + + +def swap_row_csr(X, m, n): + """ + Swaps two rows of a CSR matrix in-place. + + Parameters + ---------- + X : scipy.sparse.csc_matrix, shape=(n_samples, n_features) + m : int, index of first sample + m : int, index of second sample + """ + if m < 0: + m += X.shape[0] + if n < 0: + n += X.shape[0] + if m > n: + m, n = n, m + + indptr = X.indptr + indices = X.indices.copy() + data = X.data.copy() + + nz_m = indptr[m + 1] - indptr[m] + nz_n = indptr[n + 1] - indptr[n] + m_ptr1 = indptr[m] + m_ptr2 = indptr[m + 1] + n_ptr1 = indptr[n] + n_ptr2 = indptr[n + 1] + + # If non zero rows are equal in mth and nth row, then swapping becomes + # easy. + if nz_m == nz_n: + mask = X.indices[m_ptr1: m_ptr2].copy() + X.indices[m_ptr1: m_ptr2] = X.indices[n_ptr1: n_ptr2] + X.indices[n_ptr1: n_ptr2] = mask + mask = X.data[m_ptr1: m_ptr2].copy() + X.data[m_ptr1: m_ptr2] = X.data[n_ptr1: n_ptr2] + X.data[n_ptr1: n_ptr2] = mask + + else: + # Modify indptr first + X.indptr[m + 2: n] += nz_n - nz_m + X.indptr[m + 1] = X.indptr[m] + nz_n + X.indptr[n] = X.indptr[n + 1] - nz_m + + mask1 = X.indices[m_ptr1: m_ptr2].copy() + mask2 = X.indices[n_ptr1: n_ptr2].copy() + X.indices[m_ptr1: m_ptr1 + nz_n] = mask2 + X.indices[n_ptr2 - nz_m: n_ptr2] = mask1 + X.indices[m_ptr1 + nz_n: n_ptr2 - nz_m] = indices[m_ptr2: n_ptr1] + + mask1 = X.data[m_ptr1: m_ptr2].copy() + mask2 = X.data[n_ptr1: n_ptr2].copy() + X.data[m_ptr1: m_ptr1 + nz_n] = mask2 + X.data[n_ptr2 - nz_m: n_ptr2] = mask1 + X.data[m_ptr1 + nz_n: n_ptr2 - nz_m] = data[m_ptr2: n_ptr1] + + +def swap_row(X, m, n): + """ + Swaps two rows of a CSC/CSR matrix in-place. + + Parameters + ---------- + X : scipy.sparse.csc_matrix, shape=(n_samples, n_features) + m : int, index of first sample + m : int, index of second sample + """ + if isinstance(X, sp.csc_matrix): + return swap_row_csc(X, m, n) + elif isinstance(X, sp.csr_matrix): + return swap_row_csr(X, m, n) + else: + raise TypeError( + "Unsupported type; expected a CSR or CSC sparse matrix.") diff --git a/sklearn/utils/tests/test_sparsefuncs.py b/sklearn/utils/tests/test_sparsefuncs.py index 6305d1b0d47b0..9a8f1559e3d41 100644 --- a/sklearn/utils/tests/test_sparsefuncs.py +++ b/sklearn/utils/tests/test_sparsefuncs.py @@ -1,10 +1,13 @@ import numpy as np import scipy.sparse as sp + +from scipy import linalg from numpy.testing import assert_array_almost_equal, assert_array_equal from sklearn.datasets import make_classification from sklearn.utils.sparsefuncs import (mean_variance_axis0, - inplace_column_scale) + inplace_column_scale, + swap_row) from sklearn.utils.sparsefuncs_fast import assign_rows_csr from sklearn.utils.testing import assert_raises @@ -60,3 +63,29 @@ def test_inplace_column_scale(): assert_array_almost_equal(XA, Xc.toarray()) assert_array_almost_equal(XA, Xr.toarray()) assert_raises(TypeError, inplace_column_scale, X.tolil(), scale) + + +def test_swap_row(): + X = np.array([[0, 3, 0], + [2, 4, 0], + [0, 0, 0], + [9, 8, 7], + [4, 0, 5]], dtype=np.float64) + X_csr = sp.csr_matrix(X) + X_csc = sp.csc_matrix(X) + + swap = linalg.get_blas_funcs(('swap',), (X,)) + swap = swap[0] + X[0], X[-1] = swap(X[0], X[-1]) + swap_row(X_csr, 0, -1) + swap_row(X_csc, 0, -1) + assert_array_equal(X_csr.toarray(), X_csc.toarray()) + assert_array_equal(X, X_csc.toarray()) + assert_array_equal(X, X_csr.toarray()) + + X[2], X[3] = swap(X[2], X[3]) + swap_row(X_csr, 2, 3) + swap_row(X_csc, 2, 3) + assert_array_equal(X_csr.toarray(), X_csc.toarray()) + assert_array_equal(X, X_csc.toarray()) + assert_array_equal(X, X_csr.toarray()) From 924db9b07e82f2f76776fc94d99dbbc4d4076db5 Mon Sep 17 00:00:00 2001 From: Manoj-Kumar-S Date: Thu, 24 Apr 2014 20:50:11 +0530 Subject: [PATCH 2/4] Made the following changes a] Replaced numpy slicing with concatanate b] Added swap_sparse_column --- sklearn/utils/sparsefuncs.py | 68 ++++++++++++++----------- sklearn/utils/tests/test_sparsefuncs.py | 28 +++++++++- 2 files changed, 65 insertions(+), 31 deletions(-) diff --git a/sklearn/utils/sparsefuncs.py b/sklearn/utils/sparsefuncs.py index a5e3103a2281c..69c1708ab2a7e 100644 --- a/sklearn/utils/sparsefuncs.py +++ b/sklearn/utils/sparsefuncs.py @@ -2,6 +2,7 @@ # License: BSD 3 clause import scipy.sparse as sp +import numpy as np from .sparsefuncs_fast import (csr_mean_variance_axis0, csc_mean_variance_axis0, @@ -66,7 +67,7 @@ def swap_row_csc(X, m, n): ---------- X : scipy.sparse.csc_matrix, shape=(n_samples, n_features) m : int, index of first sample - m : int, index of second sample + n : int, index of second sample """ if m < 0: m += X.shape[0] @@ -86,7 +87,7 @@ def swap_row_csr(X, m, n): ---------- X : scipy.sparse.csc_matrix, shape=(n_samples, n_features) m : int, index of first sample - m : int, index of second sample + n : int, index of second sample """ if m < 0: m += X.shape[0] @@ -96,43 +97,27 @@ def swap_row_csr(X, m, n): m, n = n, m indptr = X.indptr - indices = X.indices.copy() - data = X.data.copy() - - nz_m = indptr[m + 1] - indptr[m] - nz_n = indptr[n + 1] - indptr[n] m_ptr1 = indptr[m] m_ptr2 = indptr[m + 1] n_ptr1 = indptr[n] n_ptr2 = indptr[n + 1] + nz_m = m_ptr2 - m_ptr1 + nz_n = n_ptr2 - n_ptr1 - # If non zero rows are equal in mth and nth row, then swapping becomes - # easy. - if nz_m == nz_n: - mask = X.indices[m_ptr1: m_ptr2].copy() - X.indices[m_ptr1: m_ptr2] = X.indices[n_ptr1: n_ptr2] - X.indices[n_ptr1: n_ptr2] = mask - mask = X.data[m_ptr1: m_ptr2].copy() - X.data[m_ptr1: m_ptr2] = X.data[n_ptr1: n_ptr2] - X.data[n_ptr1: n_ptr2] = mask - else: + if nz_m != nz_n: # Modify indptr first - X.indptr[m + 2: n] += nz_n - nz_m + X.indptr[m + 2:n] += nz_n - nz_m X.indptr[m + 1] = X.indptr[m] + nz_n X.indptr[n] = X.indptr[n + 1] - nz_m - mask1 = X.indices[m_ptr1: m_ptr2].copy() - mask2 = X.indices[n_ptr1: n_ptr2].copy() - X.indices[m_ptr1: m_ptr1 + nz_n] = mask2 - X.indices[n_ptr2 - nz_m: n_ptr2] = mask1 - X.indices[m_ptr1 + nz_n: n_ptr2 - nz_m] = indices[m_ptr2: n_ptr1] - - mask1 = X.data[m_ptr1: m_ptr2].copy() - mask2 = X.data[n_ptr1: n_ptr2].copy() - X.data[m_ptr1: m_ptr1 + nz_n] = mask2 - X.data[n_ptr2 - nz_m: n_ptr2] = mask1 - X.data[m_ptr1 + nz_n: n_ptr2 - nz_m] = data[m_ptr2: n_ptr1] + X.indices = np.concatenate([X.indices[:m_ptr1], X.indices[n_ptr1:n_ptr2], + X.indices[m_ptr2:n_ptr1], + X.indices[m_ptr1:m_ptr2], + X.indices[n_ptr2:]]) + X.data = np.concatenate([X.data[:m_ptr1], X.data[n_ptr1:n_ptr2], + X.data[m_ptr2:n_ptr1], X.data[m_ptr1:m_ptr2], + X.data[n_ptr2:]]) def swap_row(X, m, n): @@ -143,7 +128,7 @@ def swap_row(X, m, n): ---------- X : scipy.sparse.csc_matrix, shape=(n_samples, n_features) m : int, index of first sample - m : int, index of second sample + n : int, index of second sample """ if isinstance(X, sp.csc_matrix): return swap_row_csc(X, m, n) @@ -152,3 +137,26 @@ def swap_row(X, m, n): else: raise TypeError( "Unsupported type; expected a CSR or CSC sparse matrix.") + + +def swap_column(X, m, n): + """ + Swaps two columns of a CSC/CSR matrix in-place. + + Parameters + ---------- + X : scipy.sparse.csc_matrix, shape=(n_samples, n_features) + m : int, index of first sample + n : int, index of second sample + """ + if m < 0: + m += X.shape[1] + if n < 0: + n += X.shape[1] + if isinstance(X, sp.csc_matrix): + return swap_row_csr(X, m, n) + elif isinstance(X, sp.csr_matrix): + return swap_row_csc(X, m, n) + else: + raise TypeError( + "Unsupported type; expected a CSR or CSC sparse matrix.") diff --git a/sklearn/utils/tests/test_sparsefuncs.py b/sklearn/utils/tests/test_sparsefuncs.py index 9a8f1559e3d41..3ed04f41bffe4 100644 --- a/sklearn/utils/tests/test_sparsefuncs.py +++ b/sklearn/utils/tests/test_sparsefuncs.py @@ -7,7 +7,7 @@ from sklearn.datasets import make_classification from sklearn.utils.sparsefuncs import (mean_variance_axis0, inplace_column_scale, - swap_row) + swap_row, swap_column) from sklearn.utils.sparsefuncs_fast import assign_rows_csr from sklearn.utils.testing import assert_raises @@ -89,3 +89,29 @@ def test_swap_row(): assert_array_equal(X_csr.toarray(), X_csc.toarray()) assert_array_equal(X, X_csc.toarray()) assert_array_equal(X, X_csr.toarray()) + + +def test_swap_column(): + X = np.array([[0, 3, 0], + [2, 4, 0], + [0, 0, 0], + [9, 8, 7], + [4, 0, 5]], dtype=np.float64) + X_csr = sp.csr_matrix(X) + X_csc = sp.csc_matrix(X) + + swap = linalg.get_blas_funcs(('swap',), (X,)) + swap = swap[0] + X[:, 0], X[:, -1] = swap(X[:, 0], X[:, -1]) + swap_column(X_csr, 0, -1) + swap_column(X_csc, 0, -1) + assert_array_equal(X_csr.toarray(), X_csc.toarray()) + assert_array_equal(X, X_csc.toarray()) + assert_array_equal(X, X_csr.toarray()) + + X[:, 0], X[:, 1] = swap(X[:, 0], X[:, 1]) + swap_column(X_csr, 0, 1) + swap_column(X_csc, 0, 1) + assert_array_equal(X_csr.toarray(), X_csc.toarray()) + assert_array_equal(X, X_csc.toarray()) + assert_array_equal(X, X_csr.toarray()) From 31d438eb3df7e75f53d4cf895c7a0d91269be399 Mon Sep 17 00:00:00 2001 From: Manoj-Kumar-S Date: Sat, 26 Apr 2014 23:38:58 +0530 Subject: [PATCH 3/4] COSMIT: Replaced ptr1/2 with start/stop --- sklearn/utils/sparsefuncs.py | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/sklearn/utils/sparsefuncs.py b/sklearn/utils/sparsefuncs.py index 69c1708ab2a7e..31b9efb57f7db 100644 --- a/sklearn/utils/sparsefuncs.py +++ b/sklearn/utils/sparsefuncs.py @@ -97,27 +97,30 @@ def swap_row_csr(X, m, n): m, n = n, m indptr = X.indptr - m_ptr1 = indptr[m] - m_ptr2 = indptr[m + 1] - n_ptr1 = indptr[n] - n_ptr2 = indptr[n + 1] - nz_m = m_ptr2 - m_ptr1 - nz_n = n_ptr2 - n_ptr1 + m_start = indptr[m] + m_stop = indptr[m + 1] + n_start = indptr[n] + n_stop = indptr[n + 1] + nz_m = m_stop - m_start + nz_n = n_stop - n_start if nz_m != nz_n: # Modify indptr first X.indptr[m + 2:n] += nz_n - nz_m - X.indptr[m + 1] = X.indptr[m] + nz_n - X.indptr[n] = X.indptr[n + 1] - nz_m - - X.indices = np.concatenate([X.indices[:m_ptr1], X.indices[n_ptr1:n_ptr2], - X.indices[m_ptr2:n_ptr1], - X.indices[m_ptr1:m_ptr2], - X.indices[n_ptr2:]]) - X.data = np.concatenate([X.data[:m_ptr1], X.data[n_ptr1:n_ptr2], - X.data[m_ptr2:n_ptr1], X.data[m_ptr1:m_ptr2], - X.data[n_ptr2:]]) + X.indptr[m + 1] = m_start + nz_n + X.indptr[n] = n_stop - nz_m + + X.indices = np.concatenate([X.indices[:m_start], + X.indices[n_start:n_stop], + X.indices[m_stop:n_start], + X.indices[m_start:m_stop], + X.indices[n_stop:]]) + X.data = np.concatenate([X.data[:m_start], + X.data[n_start:n_stop], + X.data[m_stop:n_start], + X.data[m_start:m_stop], + X.data[n_stop:]]) def swap_row(X, m, n): From fc634f95f249e7cba7e5a219831ea66ce106a071 Mon Sep 17 00:00:00 2001 From: Manoj-Kumar-S Date: Wed, 30 Apr 2014 23:27:26 +0530 Subject: [PATCH 4/4] Made the following changes 1. Minor changes to docs 2. Replaced swap with inplace_swap --- sklearn/utils/sparsefuncs.py | 71 ++++++++++++++++++------- sklearn/utils/tests/test_sparsefuncs.py | 22 ++++---- 2 files changed, 62 insertions(+), 31 deletions(-) diff --git a/sklearn/utils/sparsefuncs.py b/sklearn/utils/sparsefuncs.py index 31b9efb57f7db..f567db06a04f6 100644 --- a/sklearn/utils/sparsefuncs.py +++ b/sklearn/utils/sparsefuncs.py @@ -59,16 +59,25 @@ def inplace_column_scale(X, scale): "Unsupported type; expected a CSR or CSC sparse matrix.") -def swap_row_csc(X, m, n): +def inplace_swap_row_csc(X, m, n): """ Swaps two rows of a CSC matrix in-place. Parameters ---------- - X : scipy.sparse.csc_matrix, shape=(n_samples, n_features) - m : int, index of first sample - n : int, index of second sample + X: scipy.sparse.csc_matrix, shape=(n_samples, n_features) + Matrix whose two rows are to be swapped. + + m: int + Index of the row of X to be swapped. + + n: int + Index of the row of X to be swapped. """ + for t in [m, n]: + if isinstance(t, np.ndarray): + raise TypeError("m and n should be valid integers") + if m < 0: m += X.shape[0] if n < 0: @@ -79,20 +88,32 @@ def swap_row_csc(X, m, n): X.indices[m_mask] = n -def swap_row_csr(X, m, n): +def inplace_swap_row_csr(X, m, n): """ Swaps two rows of a CSR matrix in-place. Parameters ---------- - X : scipy.sparse.csc_matrix, shape=(n_samples, n_features) - m : int, index of first sample - n : int, index of second sample + X: scipy.sparse.csr_matrix, shape=(n_samples, n_features) + Matrix whose two rows are to be swapped. + + m: int + Index of the row of X to be swapped. + + n: int + Index of the row of X to be swapped. """ + for t in [m, n]: + if isinstance(t, np.ndarray): + raise TypeError("m and n should be valid integers") + if m < 0: m += X.shape[0] if n < 0: n += X.shape[0] + + # The following swapping makes life easier since m is assumed to be the + # smaller integer below. if m > n: m, n = n, m @@ -123,43 +144,53 @@ def swap_row_csr(X, m, n): X.data[n_stop:]]) -def swap_row(X, m, n): +def inplace_swap_row(X, m, n): """ Swaps two rows of a CSC/CSR matrix in-place. Parameters ---------- - X : scipy.sparse.csc_matrix, shape=(n_samples, n_features) - m : int, index of first sample - n : int, index of second sample + X : CSR or CSC sparse matrix, shape=(n_samples, n_features) + Matrix whose two rows are to be swapped. + + m: int + Index of the row of X to be swapped. + + n: int + Index of the row of X to be swapped. """ if isinstance(X, sp.csc_matrix): - return swap_row_csc(X, m, n) + return inplace_swap_row_csc(X, m, n) elif isinstance(X, sp.csr_matrix): - return swap_row_csr(X, m, n) + return inplace_swap_row_csr(X, m, n) else: raise TypeError( "Unsupported type; expected a CSR or CSC sparse matrix.") -def swap_column(X, m, n): +def inplace_swap_column(X, m, n): """ Swaps two columns of a CSC/CSR matrix in-place. Parameters ---------- - X : scipy.sparse.csc_matrix, shape=(n_samples, n_features) - m : int, index of first sample - n : int, index of second sample + X : CSR or CSC sparse matrix, shape=(n_samples, n_features) + Matrix whose two columns are to be swapped. + + m: int + Index of the column of X to be swapped. + + n : int + Index of the column of X to be swapped. """ if m < 0: m += X.shape[1] if n < 0: n += X.shape[1] if isinstance(X, sp.csc_matrix): - return swap_row_csr(X, m, n) + return inplace_swap_row_csr(X, m, n) elif isinstance(X, sp.csr_matrix): - return swap_row_csc(X, m, n) + return inplace_swap_row_csc(X, m, n) else: raise TypeError( "Unsupported type; expected a CSR or CSC sparse matrix.") diff --git a/sklearn/utils/tests/test_sparsefuncs.py b/sklearn/utils/tests/test_sparsefuncs.py index 3ed04f41bffe4..d511f014dabda 100644 --- a/sklearn/utils/tests/test_sparsefuncs.py +++ b/sklearn/utils/tests/test_sparsefuncs.py @@ -7,7 +7,7 @@ from sklearn.datasets import make_classification from sklearn.utils.sparsefuncs import (mean_variance_axis0, inplace_column_scale, - swap_row, swap_column) + inplace_swap_row, inplace_swap_column) from sklearn.utils.sparsefuncs_fast import assign_rows_csr from sklearn.utils.testing import assert_raises @@ -65,7 +65,7 @@ def test_inplace_column_scale(): assert_raises(TypeError, inplace_column_scale, X.tolil(), scale) -def test_swap_row(): +def test_inplace_swap_row(): X = np.array([[0, 3, 0], [2, 4, 0], [0, 0, 0], @@ -77,21 +77,21 @@ def test_swap_row(): swap = linalg.get_blas_funcs(('swap',), (X,)) swap = swap[0] X[0], X[-1] = swap(X[0], X[-1]) - swap_row(X_csr, 0, -1) - swap_row(X_csc, 0, -1) + inplace_swap_row(X_csr, 0, -1) + inplace_swap_row(X_csc, 0, -1) assert_array_equal(X_csr.toarray(), X_csc.toarray()) assert_array_equal(X, X_csc.toarray()) assert_array_equal(X, X_csr.toarray()) X[2], X[3] = swap(X[2], X[3]) - swap_row(X_csr, 2, 3) - swap_row(X_csc, 2, 3) + inplace_swap_row(X_csr, 2, 3) + inplace_swap_row(X_csc, 2, 3) assert_array_equal(X_csr.toarray(), X_csc.toarray()) assert_array_equal(X, X_csc.toarray()) assert_array_equal(X, X_csr.toarray()) -def test_swap_column(): +def test_inplace_swap_column(): X = np.array([[0, 3, 0], [2, 4, 0], [0, 0, 0], @@ -103,15 +103,15 @@ def test_swap_column(): swap = linalg.get_blas_funcs(('swap',), (X,)) swap = swap[0] X[:, 0], X[:, -1] = swap(X[:, 0], X[:, -1]) - swap_column(X_csr, 0, -1) - swap_column(X_csc, 0, -1) + inplace_swap_column(X_csr, 0, -1) + inplace_swap_column(X_csc, 0, -1) assert_array_equal(X_csr.toarray(), X_csc.toarray()) assert_array_equal(X, X_csc.toarray()) assert_array_equal(X, X_csr.toarray()) X[:, 0], X[:, 1] = swap(X[:, 0], X[:, 1]) - swap_column(X_csr, 0, 1) - swap_column(X_csc, 0, 1) + inplace_swap_column(X_csr, 0, 1) + inplace_swap_column(X_csc, 0, 1) assert_array_equal(X_csr.toarray(), X_csc.toarray()) assert_array_equal(X, X_csc.toarray()) assert_array_equal(X, X_csr.toarray())