From faf305b062b34f1e47e7f596e18ae04cb5cb94eb Mon Sep 17 00:00:00 2001 From: murata-yu Date: Thu, 16 Dec 2021 08:26:52 +0000 Subject: [PATCH 1/9] added copy option --- sklearn/utils/__init__.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index c1f7c2e641502..1356398aec61a 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -414,7 +414,8 @@ def resample(*arrays, replace=True, n_samples=None, random_state=None, - stratify=None): + stratify=None, + copy=True): """Resample arrays or sparse matrices in a consistent way. The default strategy implements one step of the bootstrapping @@ -554,7 +555,14 @@ def resample(*arrays, # convert sparse matrices to CSR for row-based indexing arrays = [a.tocsr() if issparse(a) else a for a in arrays] - resampled_arrays = [_safe_indexing(a, indices) for a in arrays] + + if copy: + resampled_arrays = [_safe_indexing(a, indices) for a in arrays] + else: + for a in arrays: + a[:] = _safe_indexing(a, indices) + resampled_arrays = arrays + if len(resampled_arrays) == 1: # syntactic sugar for the unit argument case return resampled_arrays[0] @@ -562,7 +570,7 @@ def resample(*arrays, return resampled_arrays -def shuffle(*arrays, random_state=None, n_samples=None): +def shuffle(*arrays, random_state=None, n_samples=None, copy=True): """Shuffle arrays or sparse matrices in a consistent way. This is a convenience alias to ``resample(*arrays, replace=False)`` to do @@ -628,7 +636,7 @@ def shuffle(*arrays, random_state=None, n_samples=None): resample """ return resample(*arrays, replace=False, n_samples=n_samples, - random_state=random_state) + random_state=random_state, copy=copy) def safe_sqr(X, *, copy=True): From 998bfbd6e314b18ab8191e75fe1f3be8cbe55da6 Mon Sep 17 00:00:00 2001 From: murata-yu Date: Fri, 17 Dec 2021 00:21:35 +0000 Subject: [PATCH 2/9] fixed docs to validate copy when replace=False --- sklearn/utils/__init__.py | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index 1356398aec61a..948e0e7430aac 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -449,12 +449,19 @@ def resample(*arrays, If not None, data is split in a stratified fashion, using this as the class labels. + copy : bool, default=True + Flag to return the copied array. This frag works if `replace=False`. + If `replace=False` and `copy=False`, + indexable data-structures in *arrays are destructed + instead of consuming less memory. + Returns ------- resampled_arrays : sequence of array-like of shape (n_samples,) or \ (n_samples, n_outputs) Sequence of resampled copies of the collections. The original arrays - are not impacted. + are not impacted if `copy=True`. + If `replace=False` and `copy=False`, the original arrays are resampled too. Examples -------- @@ -515,6 +522,15 @@ def resample(*arrays, "when replace is False" % (max_n_samples, n_samples)) + if not copy: + if replace: + raise ValueError("`copy=False` is valid only when `replace=False`") + + # FIXME: Make sure MockDataFrame also supports `copy=False`` + for a in arrays: + if hasattr(a, 'iloc'): + raise ValueError("MockDataFrame does not support `copy=False`") + check_consistent_length(*arrays) if stratify is None: @@ -593,11 +609,17 @@ def shuffle(*arrays, random_state=None, n_samples=None, copy=True): automatically set to the first dimension of the arrays. It should not be larger than the length of arrays. + copy : bool, default=True + Flag to return the copied array. If set False, + indexable data-structures in *arrays are destructed + instead of consuming less memory. + Returns ------- shuffled_arrays : sequence of indexable data-structures Sequence of shuffled copies of the collections. The original arrays - are not impacted. + are not impacted if `copy=True`. + If `copy=False`, the original arrays are shuffled too. Examples -------- From 8ebd05913912ee24079cc8113489df8adbcc1e37 Mon Sep 17 00:00:00 2001 From: murata-yu Date: Fri, 17 Dec 2021 00:27:49 +0000 Subject: [PATCH 3/9] added test for shuffle without copy --- sklearn/utils/tests/test_utils.py | 59 +++++++++++++++++++++++-------- 1 file changed, 44 insertions(+), 15 deletions(-) diff --git a/sklearn/utils/tests/test_utils.py b/sklearn/utils/tests/test_utils.py index 44e448841cef0..85914c472d4ee 100644 --- a/sklearn/utils/tests/test_utils.py +++ b/sklearn/utils/tests/test_utils.py @@ -115,6 +115,11 @@ def test_resample(): resample([0], [0, 1]) with pytest.raises(ValueError): resample([0, 1], [0, 1], replace=False, n_samples=3) + with pytest.raises(ValueError): + resample([0], [1], copy=False) + with pytest.raises(ValueError): + resample(MockDataFrame(np.array([['a', 0], ['b', 1]], dtype=object)), + replace=False, copy=False) # Issue:6581, n_samples can be more when replace is True (default). assert len(resample([1, 2], n_samples=5)) == 5 @@ -144,17 +149,17 @@ def test_resample_stratified_replace(): X = rng.normal(size=(n_samples, 1)) y = rng.randint(0, 2, size=n_samples) - X_replace, _ = resample(X, y, replace=True, n_samples=50, - random_state=rng, stratify=y) - X_no_replace, _ = resample(X, y, replace=False, n_samples=50, - random_state=rng, stratify=y) + X_replace, _ = resample(X.copy(), y, replace=True, n_samples=50, + random_state=rng, stratify=y, copy=copy) + X_no_replace, _ = resample(X.copy(), y, replace=False, n_samples=50, + random_state=rng, stratify=y, copy=copy) assert np.unique(X_replace).shape[0] < 50 assert np.unique(X_no_replace).shape[0] == 50 # make sure n_samples can be greater than X.shape[0] if we sample with # replacement - X_replace, _ = resample(X, y, replace=True, n_samples=1000, - random_state=rng, stratify=y) + X_replace, _ = resample(X.copy(), y, replace=True, n_samples=1000, + random_state=rng, stratify=y, copy=copy) assert X_replace.shape[0] == 1000 assert np.unique(X_replace).shape[0] == 100 @@ -181,6 +186,14 @@ def test_resample_stratify_sparse_error(): stratify=stratify) +def test_resample_without_copy(): + rng = np.random.RandomState(0) + n_samples = 100 + X = rng.normal(size=(n_samples, 1)) + + assert id(X) == id(resample(X, replace=False, copy=False)) + + def test_safe_mask(): random_state = check_random_state(0) X = random_state.rand(5, 4) @@ -504,17 +517,19 @@ def test_get_column_indices_pandas_nonunique_columns_error(key): assert str(exc_info.value) == err_msg -def test_shuffle_on_ndim_equals_three(): +@pytest.mark.parametrize("copy", [True, False]) +def test_shuffle_on_ndim_equals_three(copy): def to_tuple(A): # to make the inner arrays hashable return tuple(tuple(tuple(C) for C in B) for B in A) A = np.array([[[1, 2], [3, 4]], [[5, 6], [7, 8]]]) # A.shape = (2,2,2) S = set(to_tuple(A)) - shuffle(A) # shouldn't raise a ValueError for dim = 3 + shuffle(A.copy(), copy=copy) # shouldn't raise a ValueError for dim = 3 assert set(to_tuple(A)) == S -def test_shuffle_dont_convert_to_array(): +@pytest.mark.parametrize("copy", [True, False]) +def test_shuffle_dont_convert_to_array(copy): # Check that shuffle does not try to convert to numpy arrays with float # dtypes can let any indexable datastructure pass-through. a = ['a', 'b', 'c'] @@ -525,7 +540,12 @@ def test_shuffle_dont_convert_to_array(): ['c', 2]], dtype=object)) e = sp.csc_matrix(np.arange(6).reshape(3, 2)) - a_s, b_s, c_s, d_s, e_s = shuffle(a, b, c, d, e, random_state=0) + + if copy: + a_s, b_s, c_s, d_s, e_s = shuffle( + a, b, c, d, e, random_state=0, copy=copy) + else: + a_s, b_s, c_s, e_s = shuffle(a, b, c, e, random_state=0, copy=copy) assert a_s == ['c', 'b', 'a'] assert type(a_s) == list @@ -536,17 +556,26 @@ def test_shuffle_dont_convert_to_array(): assert c_s == [3, 2, 1] assert type(c_s) == list - assert_array_equal(d_s, np.array([['c', 2], - ['b', 1], - ['a', 0]], - dtype=object)) - assert type(d_s) == MockDataFrame + if copy: + assert_array_equal(d_s, np.array([['c', 2], + ['b', 1], + ['a', 0]], + dtype=object)) + assert type(d_s) == MockDataFrame assert_array_equal(e_s.toarray(), np.array([[4, 5], [2, 3], [0, 1]])) +def test_shuffle_without_copy(): + rng = np.random.RandomState(0) + n_samples = 100 + X = rng.normal(size=(n_samples, 1)) + + assert id(X) == id(shuffle(X, copy=False)) + + def test_gen_even_slices(): # check that gen_even_slices contains all samples some_range = range(10) From 719becdd3600aba9ace725b5975f9c7a6f44b39d Mon Sep 17 00:00:00 2001 From: murata-yu Date: Fri, 17 Dec 2021 01:50:21 +0000 Subject: [PATCH 4/9] added doc for shuffle --- doc/whats_new/v1.0.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 29a4bce98ecb0..1662e71697f9b 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -501,6 +501,12 @@ Changelog precision of the computed variance was very poor when the real variance is exactly zero. :pr:`19766` by :user:`Jérémie du Boisberranger `. +- |Enhancement| Added `copy` args to :func:`utils.shuffle`. If `copy=False`, + indexable data-structures in `*arrays` are destructed + instead of consuming less memory. :func:`utils.shuffle` without copy does not + support :class:`utils._mocking.MockDataFrame`. It raises `ValueError`. + :pr:`` by :user:`murata-yu`. + Code and Documentation Contributors ----------------------------------- From 274d508fed6e4a77d3a85f5174323fd58868c199 Mon Sep 17 00:00:00 2001 From: murata-yu Date: Fri, 17 Dec 2021 02:20:39 +0000 Subject: [PATCH 5/9] add pr number --- doc/whats_new/v1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 1662e71697f9b..7e06713c833ec 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -505,7 +505,7 @@ Changelog indexable data-structures in `*arrays` are destructed instead of consuming less memory. :func:`utils.shuffle` without copy does not support :class:`utils._mocking.MockDataFrame`. It raises `ValueError`. - :pr:`` by :user:`murata-yu`. + :pr:`22003` by :user:`murata-yu`. Code and Documentation Contributors ----------------------------------- From 38fed2e68e1ecd8f23dc829f2bfc1b849c5dc47a Mon Sep 17 00:00:00 2001 From: murata-yu Date: Fri, 17 Dec 2021 02:24:39 +0000 Subject: [PATCH 6/9] removed unnecessary copy args from resample test --- sklearn/utils/tests/test_utils.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sklearn/utils/tests/test_utils.py b/sklearn/utils/tests/test_utils.py index 85914c472d4ee..dd9fd584ad6f3 100644 --- a/sklearn/utils/tests/test_utils.py +++ b/sklearn/utils/tests/test_utils.py @@ -149,17 +149,17 @@ def test_resample_stratified_replace(): X = rng.normal(size=(n_samples, 1)) y = rng.randint(0, 2, size=n_samples) - X_replace, _ = resample(X.copy(), y, replace=True, n_samples=50, - random_state=rng, stratify=y, copy=copy) - X_no_replace, _ = resample(X.copy(), y, replace=False, n_samples=50, - random_state=rng, stratify=y, copy=copy) + X_replace, _ = resample(X, y, replace=True, n_samples=50, + random_state=rng, stratify=y) + X_no_replace, _ = resample(X, y, replace=False, n_samples=50, + random_state=rng, stratify=y, ) assert np.unique(X_replace).shape[0] < 50 assert np.unique(X_no_replace).shape[0] == 50 # make sure n_samples can be greater than X.shape[0] if we sample with # replacement - X_replace, _ = resample(X.copy(), y, replace=True, n_samples=1000, - random_state=rng, stratify=y, copy=copy) + X_replace, _ = resample(X, y, replace=True, n_samples=1000, + random_state=rng, stratify=y) assert X_replace.shape[0] == 1000 assert np.unique(X_replace).shape[0] == 100 From 6f7aadfd40104f8ba60ec5e5c1e26ab46f54a6ea Mon Sep 17 00:00:00 2001 From: murata-yu Date: Fri, 17 Dec 2021 02:39:37 +0000 Subject: [PATCH 7/9] formatting --- sklearn/utils/__init__.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index 53782557bd881..ba08982ed4ef1 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -450,7 +450,9 @@ def _get_column_indices(X, key): ) -def resample(*arrays, replace=True, n_samples=None, random_state=None, stratify=None, copy=True): +def resample( + *arrays, replace=True, n_samples=None, random_state=None, stratify=None, copy=True +): """Resample arrays or sparse matrices in a consistent way. The default strategy implements one step of the bootstrapping @@ -565,7 +567,7 @@ def resample(*arrays, replace=True, n_samples=None, random_state=None, stratify= # FIXME: Make sure MockDataFrame also supports `copy=False`` for a in arrays: - if hasattr(a, 'iloc'): + if hasattr(a, "iloc"): raise ValueError("MockDataFrame does not support `copy=False`") check_consistent_length(*arrays) @@ -695,9 +697,13 @@ def shuffle(*arrays, random_state=None, n_samples=None, copy=True): -------- resample """ - + return resample( - *arrays, replace=False, n_samples=n_samples, random_state=random_state, copy=copy + *arrays, + replace=False, + n_samples=n_samples, + random_state=random_state, + copy=copy, ) From 7dc284e43513090b76bdb675d46486bcdfc6a75c Mon Sep 17 00:00:00 2001 From: murata-yu Date: Fri, 17 Dec 2021 04:08:31 +0000 Subject: [PATCH 8/9] fixed to use inline codebock --- sklearn/utils/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index ba08982ed4ef1..0d2126ac899f0 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -489,7 +489,7 @@ def resample( copy : bool, default=True Flag to return the copied array. This frag works if `replace=False`. If `replace=False` and `copy=False`, - indexable data-structures in *arrays are destructed + indexable data-structures in `*arrays` are destructed instead of consuming less memory. Returns @@ -650,7 +650,7 @@ def shuffle(*arrays, random_state=None, n_samples=None, copy=True): copy : bool, default=True Flag to return the copied array. If set False, - indexable data-structures in *arrays are destructed + indexable data-structures in `*arrays` are destructed instead of consuming less memory. Returns From fd330eb2ecdb7e694ff777626389dd7828777f39 Mon Sep 17 00:00:00 2001 From: murata-yu Date: Fri, 17 Dec 2021 06:03:45 +0000 Subject: [PATCH 9/9] format according to black --- sklearn/utils/tests/test_utils.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/sklearn/utils/tests/test_utils.py b/sklearn/utils/tests/test_utils.py index fc913bc63b3b3..60617f0dc516a 100644 --- a/sklearn/utils/tests/test_utils.py +++ b/sklearn/utils/tests/test_utils.py @@ -119,8 +119,11 @@ def test_resample(): with pytest.raises(ValueError): resample([0], [1], copy=False) with pytest.raises(ValueError): - resample(MockDataFrame(np.array([['a', 0], ['b', 1]], dtype=object)), - replace=False, copy=False) + resample( + MockDataFrame(np.array([["a", 0], ["b", 1]], dtype=object)), + replace=False, + copy=False, + ) # Issue:6581, n_samples can be more when replace is True (default). assert len(resample([1, 2], n_samples=5)) == 5 @@ -530,7 +533,7 @@ def test_get_column_indices_pandas_nonunique_columns_error(key): @pytest.mark.parametrize("copy", [True, False]) def test_shuffle_on_ndim_equals_three(copy): - def to_tuple(A): # to make the inner arrays hashable + def to_tuple(A): # to make the inner arrays hashable return tuple(tuple(tuple(C) for C in B) for B in A) A = np.array([[[1, 2], [3, 4]], [[5, 6], [7, 8]]]) # A.shape = (2,2,2) @@ -550,10 +553,9 @@ def test_shuffle_dont_convert_to_array(copy): e = sp.csc_matrix(np.arange(6).reshape(3, 2)) if copy: - a_s, b_s, c_s, d_s, e_s = shuffle( - a, b, c, d, e, random_state=0, copy=copy) + a_s, b_s, c_s, d_s, e_s = shuffle(a, b, c, d, e, random_state=0, copy=copy) else: - a_s, b_s, c_s, e_s = shuffle(a, b, c, e, random_state=0, copy=copy) + a_s, b_s, c_s, e_s = shuffle(a, b, c, e, random_state=0, copy=copy) assert a_s == ["c", "b", "a"] assert type(a_s) == list @@ -565,10 +567,7 @@ def test_shuffle_dont_convert_to_array(copy): assert type(c_s) == list if copy: - assert_array_equal(d_s, np.array([['c', 2], - ['b', 1], - ['a', 0]], - dtype=object)) + assert_array_equal(d_s, np.array([["c", 2], ["b", 1], ["a", 0]], dtype=object)) assert type(d_s) == MockDataFrame assert_array_equal(e_s.toarray(), np.array([[4, 5], [2, 3], [0, 1]]))