From faf305b062b34f1e47e7f596e18ae04cb5cb94eb Mon Sep 17 00:00:00 2001
From: murata-yu <murata.yu@jp.fujitsu.com>
Date: Thu, 16 Dec 2021 08:26:52 +0000
Subject: [PATCH 1/9] added copy option

---
 sklearn/utils/__init__.py | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py
index c1f7c2e641502..1356398aec61a 100644
--- a/sklearn/utils/__init__.py
+++ b/sklearn/utils/__init__.py
@@ -414,7 +414,8 @@ def resample(*arrays,
              replace=True,
              n_samples=None,
              random_state=None,
-             stratify=None):
+             stratify=None,
+             copy=True):
     """Resample arrays or sparse matrices in a consistent way.
 
     The default strategy implements one step of the bootstrapping
@@ -554,7 +555,14 @@ def resample(*arrays,
 
     # convert sparse matrices to CSR for row-based indexing
     arrays = [a.tocsr() if issparse(a) else a for a in arrays]
-    resampled_arrays = [_safe_indexing(a, indices) for a in arrays]
+
+    if copy:
+        resampled_arrays = [_safe_indexing(a, indices) for a in arrays]
+    else:
+        for a in arrays:
+            a[:] = _safe_indexing(a, indices)
+        resampled_arrays = arrays
+
     if len(resampled_arrays) == 1:
         # syntactic sugar for the unit argument case
         return resampled_arrays[0]
@@ -562,7 +570,7 @@ def resample(*arrays,
         return resampled_arrays
 
 
-def shuffle(*arrays, random_state=None, n_samples=None):
+def shuffle(*arrays, random_state=None, n_samples=None, copy=True):
     """Shuffle arrays or sparse matrices in a consistent way.
 
     This is a convenience alias to ``resample(*arrays, replace=False)`` to do
@@ -628,7 +636,7 @@ def shuffle(*arrays, random_state=None, n_samples=None):
     resample
     """
     return resample(*arrays, replace=False, n_samples=n_samples,
-                    random_state=random_state)
+                    random_state=random_state, copy=copy)
 
 
 def safe_sqr(X, *, copy=True):

From 998bfbd6e314b18ab8191e75fe1f3be8cbe55da6 Mon Sep 17 00:00:00 2001
From: murata-yu <murata.yu@jp.fujitsu.com>
Date: Fri, 17 Dec 2021 00:21:35 +0000
Subject: [PATCH 2/9] fixed docs to validate copy when replace=False

---
 sklearn/utils/__init__.py | 26 ++++++++++++++++++++++++--
 1 file changed, 24 insertions(+), 2 deletions(-)

diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py
index 1356398aec61a..948e0e7430aac 100644
--- a/sklearn/utils/__init__.py
+++ b/sklearn/utils/__init__.py
@@ -449,12 +449,19 @@ def resample(*arrays,
         If not None, data is split in a stratified fashion, using this as
         the class labels.
 
+    copy : bool, default=True
+        Flag to return the copied array. This frag works if `replace=False`.
+        If `replace=False` and `copy=False`,
+        indexable data-structures in *arrays are destructed
+        instead of consuming less memory.
+
     Returns
     -------
     resampled_arrays : sequence of array-like of shape (n_samples,) or \
             (n_samples, n_outputs)
         Sequence of resampled copies of the collections. The original arrays
-        are not impacted.
+        are not impacted if `copy=True`.
+        If `replace=False` and `copy=False`, the original arrays are resampled too.
 
     Examples
     --------
@@ -515,6 +522,15 @@ def resample(*arrays,
                          "when replace is False" % (max_n_samples,
                                                     n_samples))
 
+    if not copy:
+        if replace:
+            raise ValueError("`copy=False` is valid only when `replace=False`")
+
+        # FIXME: Make sure MockDataFrame also supports `copy=False``
+        for a in arrays:
+            if hasattr(a, 'iloc'):
+                raise ValueError("MockDataFrame does not support `copy=False`")
+
     check_consistent_length(*arrays)
 
     if stratify is None:
@@ -593,11 +609,17 @@ def shuffle(*arrays, random_state=None, n_samples=None, copy=True):
         automatically set to the first dimension of the arrays.  It should
         not be larger than the length of arrays.
 
+    copy : bool, default=True
+        Flag to return the copied array. If set False,
+        indexable data-structures in *arrays are destructed
+        instead of consuming less memory.
+
     Returns
     -------
     shuffled_arrays : sequence of indexable data-structures
         Sequence of shuffled copies of the collections. The original arrays
-        are not impacted.
+        are not impacted if `copy=True`.
+        If `copy=False`, the original arrays are shuffled too.
 
     Examples
     --------

From 8ebd05913912ee24079cc8113489df8adbcc1e37 Mon Sep 17 00:00:00 2001
From: murata-yu <murata.yu@jp.fujitsu.com>
Date: Fri, 17 Dec 2021 00:27:49 +0000
Subject: [PATCH 3/9] added test for shuffle without copy

---
 sklearn/utils/tests/test_utils.py | 59 +++++++++++++++++++++++--------
 1 file changed, 44 insertions(+), 15 deletions(-)

diff --git a/sklearn/utils/tests/test_utils.py b/sklearn/utils/tests/test_utils.py
index 44e448841cef0..85914c472d4ee 100644
--- a/sklearn/utils/tests/test_utils.py
+++ b/sklearn/utils/tests/test_utils.py
@@ -115,6 +115,11 @@ def test_resample():
         resample([0], [0, 1])
     with pytest.raises(ValueError):
         resample([0, 1], [0, 1], replace=False, n_samples=3)
+    with pytest.raises(ValueError):
+        resample([0], [1], copy=False)
+    with pytest.raises(ValueError):
+        resample(MockDataFrame(np.array([['a', 0], ['b', 1]], dtype=object)),
+                 replace=False, copy=False)
 
     # Issue:6581, n_samples can be more when replace is True (default).
     assert len(resample([1, 2], n_samples=5)) == 5
@@ -144,17 +149,17 @@ def test_resample_stratified_replace():
     X = rng.normal(size=(n_samples, 1))
     y = rng.randint(0, 2, size=n_samples)
 
-    X_replace, _ = resample(X, y, replace=True, n_samples=50,
-                            random_state=rng, stratify=y)
-    X_no_replace, _ = resample(X, y, replace=False, n_samples=50,
-                               random_state=rng, stratify=y)
+    X_replace, _ = resample(X.copy(), y, replace=True, n_samples=50,
+                            random_state=rng, stratify=y, copy=copy)
+    X_no_replace, _ = resample(X.copy(), y, replace=False, n_samples=50,
+                               random_state=rng, stratify=y, copy=copy)
     assert np.unique(X_replace).shape[0] < 50
     assert np.unique(X_no_replace).shape[0] == 50
 
     # make sure n_samples can be greater than X.shape[0] if we sample with
     # replacement
-    X_replace, _ = resample(X, y, replace=True, n_samples=1000,
-                            random_state=rng, stratify=y)
+    X_replace, _ = resample(X.copy(), y, replace=True, n_samples=1000,
+                            random_state=rng, stratify=y, copy=copy)
     assert X_replace.shape[0] == 1000
     assert np.unique(X_replace).shape[0] == 100
 
@@ -181,6 +186,14 @@ def test_resample_stratify_sparse_error():
                         stratify=stratify)
 
 
+def test_resample_without_copy():
+    rng = np.random.RandomState(0)
+    n_samples = 100
+    X = rng.normal(size=(n_samples, 1))
+
+    assert id(X) == id(resample(X, replace=False, copy=False))
+
+
 def test_safe_mask():
     random_state = check_random_state(0)
     X = random_state.rand(5, 4)
@@ -504,17 +517,19 @@ def test_get_column_indices_pandas_nonunique_columns_error(key):
     assert str(exc_info.value) == err_msg
 
 
-def test_shuffle_on_ndim_equals_three():
+@pytest.mark.parametrize("copy", [True, False])
+def test_shuffle_on_ndim_equals_three(copy):
     def to_tuple(A):    # to make the inner arrays hashable
         return tuple(tuple(tuple(C) for C in B) for B in A)
 
     A = np.array([[[1, 2], [3, 4]], [[5, 6], [7, 8]]])  # A.shape = (2,2,2)
     S = set(to_tuple(A))
-    shuffle(A)  # shouldn't raise a ValueError for dim = 3
+    shuffle(A.copy(), copy=copy)  # shouldn't raise a ValueError for dim = 3
     assert set(to_tuple(A)) == S
 
 
-def test_shuffle_dont_convert_to_array():
+@pytest.mark.parametrize("copy", [True, False])
+def test_shuffle_dont_convert_to_array(copy):
     # Check that shuffle does not try to convert to numpy arrays with float
     # dtypes can let any indexable datastructure pass-through.
     a = ['a', 'b', 'c']
@@ -525,7 +540,12 @@ def test_shuffle_dont_convert_to_array():
                                 ['c', 2]],
                       dtype=object))
     e = sp.csc_matrix(np.arange(6).reshape(3, 2))
-    a_s, b_s, c_s, d_s, e_s = shuffle(a, b, c, d, e, random_state=0)
+
+    if copy:
+        a_s, b_s, c_s, d_s, e_s = shuffle(
+            a, b, c, d, e, random_state=0, copy=copy)
+    else:
+        a_s, b_s, c_s, e_s = shuffle(a, b, c,  e, random_state=0, copy=copy)
 
     assert a_s == ['c', 'b', 'a']
     assert type(a_s) == list
@@ -536,17 +556,26 @@ def test_shuffle_dont_convert_to_array():
     assert c_s == [3, 2, 1]
     assert type(c_s) == list
 
-    assert_array_equal(d_s, np.array([['c', 2],
-                                      ['b', 1],
-                                      ['a', 0]],
-                                     dtype=object))
-    assert type(d_s) == MockDataFrame
+    if copy:
+        assert_array_equal(d_s, np.array([['c', 2],
+                                          ['b', 1],
+                                          ['a', 0]],
+                                         dtype=object))
+        assert type(d_s) == MockDataFrame
 
     assert_array_equal(e_s.toarray(), np.array([[4, 5],
                                                 [2, 3],
                                                 [0, 1]]))
 
 
+def test_shuffle_without_copy():
+    rng = np.random.RandomState(0)
+    n_samples = 100
+    X = rng.normal(size=(n_samples, 1))
+
+    assert id(X) == id(shuffle(X, copy=False))
+
+
 def test_gen_even_slices():
     # check that gen_even_slices contains all samples
     some_range = range(10)

From 719becdd3600aba9ace725b5975f9c7a6f44b39d Mon Sep 17 00:00:00 2001
From: murata-yu <murata.yu@jp.fujitsu.com>
Date: Fri, 17 Dec 2021 01:50:21 +0000
Subject: [PATCH 4/9] added doc for shuffle

---
 doc/whats_new/v1.0.rst | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst
index 29a4bce98ecb0..1662e71697f9b 100644
--- a/doc/whats_new/v1.0.rst
+++ b/doc/whats_new/v1.0.rst
@@ -501,6 +501,12 @@ Changelog
   precision of the computed variance was very poor when the real variance is
   exactly zero. :pr:`19766` by :user:`Jérémie du Boisberranger <jeremiedbb>`.
 
+- |Enhancement| Added `copy` args to :func:`utils.shuffle`. If `copy=False`,
+  indexable data-structures in `*arrays` are destructed
+  instead of consuming less memory. :func:`utils.shuffle` without copy does not
+  support :class:`utils._mocking.MockDataFrame`. It raises `ValueError`.
+  :pr:`<dummy>` by :user:`murata-yu`.
+
 Code and Documentation Contributors
 -----------------------------------
 

From 274d508fed6e4a77d3a85f5174323fd58868c199 Mon Sep 17 00:00:00 2001
From: murata-yu <murata.yu@jp.fujitsu.com>
Date: Fri, 17 Dec 2021 02:20:39 +0000
Subject: [PATCH 5/9] add pr number

---
 doc/whats_new/v1.0.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst
index 1662e71697f9b..7e06713c833ec 100644
--- a/doc/whats_new/v1.0.rst
+++ b/doc/whats_new/v1.0.rst
@@ -505,7 +505,7 @@ Changelog
   indexable data-structures in `*arrays` are destructed
   instead of consuming less memory. :func:`utils.shuffle` without copy does not
   support :class:`utils._mocking.MockDataFrame`. It raises `ValueError`.
-  :pr:`<dummy>` by :user:`murata-yu`.
+  :pr:`22003` by :user:`murata-yu`.
 
 Code and Documentation Contributors
 -----------------------------------

From 38fed2e68e1ecd8f23dc829f2bfc1b849c5dc47a Mon Sep 17 00:00:00 2001
From: murata-yu <murata.yu@jp.fujitsu.com>
Date: Fri, 17 Dec 2021 02:24:39 +0000
Subject: [PATCH 6/9] removed unnecessary copy args from resample test

---
 sklearn/utils/tests/test_utils.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/sklearn/utils/tests/test_utils.py b/sklearn/utils/tests/test_utils.py
index 85914c472d4ee..dd9fd584ad6f3 100644
--- a/sklearn/utils/tests/test_utils.py
+++ b/sklearn/utils/tests/test_utils.py
@@ -149,17 +149,17 @@ def test_resample_stratified_replace():
     X = rng.normal(size=(n_samples, 1))
     y = rng.randint(0, 2, size=n_samples)
 
-    X_replace, _ = resample(X.copy(), y, replace=True, n_samples=50,
-                            random_state=rng, stratify=y, copy=copy)
-    X_no_replace, _ = resample(X.copy(), y, replace=False, n_samples=50,
-                               random_state=rng, stratify=y, copy=copy)
+    X_replace, _ = resample(X, y, replace=True, n_samples=50,
+                            random_state=rng, stratify=y)
+    X_no_replace, _ = resample(X, y, replace=False, n_samples=50,
+                               random_state=rng, stratify=y, )
     assert np.unique(X_replace).shape[0] < 50
     assert np.unique(X_no_replace).shape[0] == 50
 
     # make sure n_samples can be greater than X.shape[0] if we sample with
     # replacement
-    X_replace, _ = resample(X.copy(), y, replace=True, n_samples=1000,
-                            random_state=rng, stratify=y, copy=copy)
+    X_replace, _ = resample(X, y, replace=True, n_samples=1000,
+                            random_state=rng, stratify=y)
     assert X_replace.shape[0] == 1000
     assert np.unique(X_replace).shape[0] == 100
 

From 6f7aadfd40104f8ba60ec5e5c1e26ab46f54a6ea Mon Sep 17 00:00:00 2001
From: murata-yu <murata.yu@jp.fujitsu.com>
Date: Fri, 17 Dec 2021 02:39:37 +0000
Subject: [PATCH 7/9] formatting

---
 sklearn/utils/__init__.py | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py
index 53782557bd881..ba08982ed4ef1 100644
--- a/sklearn/utils/__init__.py
+++ b/sklearn/utils/__init__.py
@@ -450,7 +450,9 @@ def _get_column_indices(X, key):
         )
 
 
-def resample(*arrays, replace=True, n_samples=None, random_state=None, stratify=None, copy=True):
+def resample(
+    *arrays, replace=True, n_samples=None, random_state=None, stratify=None, copy=True
+):
     """Resample arrays or sparse matrices in a consistent way.
 
     The default strategy implements one step of the bootstrapping
@@ -565,7 +567,7 @@ def resample(*arrays, replace=True, n_samples=None, random_state=None, stratify=
 
         # FIXME: Make sure MockDataFrame also supports `copy=False``
         for a in arrays:
-            if hasattr(a, 'iloc'):
+            if hasattr(a, "iloc"):
                 raise ValueError("MockDataFrame does not support `copy=False`")
 
     check_consistent_length(*arrays)
@@ -695,9 +697,13 @@ def shuffle(*arrays, random_state=None, n_samples=None, copy=True):
     --------
     resample
     """
-    
+
     return resample(
-        *arrays, replace=False, n_samples=n_samples, random_state=random_state, copy=copy
+        *arrays,
+        replace=False,
+        n_samples=n_samples,
+        random_state=random_state,
+        copy=copy,
     )
 
 

From 7dc284e43513090b76bdb675d46486bcdfc6a75c Mon Sep 17 00:00:00 2001
From: murata-yu <murata.yu@jp.fujitsu.com>
Date: Fri, 17 Dec 2021 04:08:31 +0000
Subject: [PATCH 8/9] fixed to use inline codebock

---
 sklearn/utils/__init__.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py
index ba08982ed4ef1..0d2126ac899f0 100644
--- a/sklearn/utils/__init__.py
+++ b/sklearn/utils/__init__.py
@@ -489,7 +489,7 @@ def resample(
     copy : bool, default=True
         Flag to return the copied array. This frag works if `replace=False`.
         If `replace=False` and `copy=False`,
-        indexable data-structures in *arrays are destructed
+        indexable data-structures in `*arrays` are destructed
         instead of consuming less memory.
 
     Returns
@@ -650,7 +650,7 @@ def shuffle(*arrays, random_state=None, n_samples=None, copy=True):
 
     copy : bool, default=True
         Flag to return the copied array. If set False,
-        indexable data-structures in *arrays are destructed
+        indexable data-structures in `*arrays` are destructed
         instead of consuming less memory.
 
     Returns

From fd330eb2ecdb7e694ff777626389dd7828777f39 Mon Sep 17 00:00:00 2001
From: murata-yu <murata.yu@jp.fujitsu.com>
Date: Fri, 17 Dec 2021 06:03:45 +0000
Subject: [PATCH 9/9] format according to black

---
 sklearn/utils/tests/test_utils.py | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/sklearn/utils/tests/test_utils.py b/sklearn/utils/tests/test_utils.py
index fc913bc63b3b3..60617f0dc516a 100644
--- a/sklearn/utils/tests/test_utils.py
+++ b/sklearn/utils/tests/test_utils.py
@@ -119,8 +119,11 @@ def test_resample():
     with pytest.raises(ValueError):
         resample([0], [1], copy=False)
     with pytest.raises(ValueError):
-        resample(MockDataFrame(np.array([['a', 0], ['b', 1]], dtype=object)),
-                 replace=False, copy=False)
+        resample(
+            MockDataFrame(np.array([["a", 0], ["b", 1]], dtype=object)),
+            replace=False,
+            copy=False,
+        )
 
     # Issue:6581, n_samples can be more when replace is True (default).
     assert len(resample([1, 2], n_samples=5)) == 5
@@ -530,7 +533,7 @@ def test_get_column_indices_pandas_nonunique_columns_error(key):
 
 @pytest.mark.parametrize("copy", [True, False])
 def test_shuffle_on_ndim_equals_three(copy):
-    def to_tuple(A):    # to make the inner arrays hashable
+    def to_tuple(A):  # to make the inner arrays hashable
         return tuple(tuple(tuple(C) for C in B) for B in A)
 
     A = np.array([[[1, 2], [3, 4]], [[5, 6], [7, 8]]])  # A.shape = (2,2,2)
@@ -550,10 +553,9 @@ def test_shuffle_dont_convert_to_array(copy):
     e = sp.csc_matrix(np.arange(6).reshape(3, 2))
 
     if copy:
-        a_s, b_s, c_s, d_s, e_s = shuffle(
-            a, b, c, d, e, random_state=0, copy=copy)
+        a_s, b_s, c_s, d_s, e_s = shuffle(a, b, c, d, e, random_state=0, copy=copy)
     else:
-        a_s, b_s, c_s, e_s = shuffle(a, b, c,  e, random_state=0, copy=copy)
+        a_s, b_s, c_s, e_s = shuffle(a, b, c, e, random_state=0, copy=copy)
 
     assert a_s == ["c", "b", "a"]
     assert type(a_s) == list
@@ -565,10 +567,7 @@ def test_shuffle_dont_convert_to_array(copy):
     assert type(c_s) == list
 
     if copy:
-        assert_array_equal(d_s, np.array([['c', 2],
-                                          ['b', 1],
-                                          ['a', 0]],
-                                         dtype=object))
+        assert_array_equal(d_s, np.array([["c", 2], ["b", 1], ["a", 0]], dtype=object))
         assert type(d_s) == MockDataFrame
 
     assert_array_equal(e_s.toarray(), np.array([[4, 5], [2, 3], [0, 1]]))