8000 ENH Implement utils.shuffle without copy #7754 by murata-yu · Pull Request #22003 · scikit-learn/scikit-learn · GitHub
[go: up one dir, main page]

Skip to content

ENH Implement utils.shuffle without copy #7754 #22003

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 10 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions doc/whats_new/v1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1144,6 +1144,12 @@ Changelog
- |API| Fixed several bugs in :func:`utils.graph.graph_shortest_path`, which is
now deprecated. Use `scipy.sparse.csgraph.shortest_path` instead. :pr:`20531`
by `Tom Dupre la Tour`_.

- |Enhancement| Added a `copy` argument to :func:`utils.shuffle`. If `copy=False`,
the indexable data-structures in `*arrays` are shuffled in place,
consuming less memory. :func:`utils.shuffle` with `copy=False` does not
support :class:`utils._mocking.MockDataFrame`; it raises a `ValueError`.
:pr:`22003` by :user:`murata-yu`.

Code and Documentation Contributors
-----------------------------------
Expand Down
48 changes: 42 additions & 6 deletions sklearn/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -450,7 +450,9 @@ def _get_column_indices(X, key):
)


def resample(*arrays, replace=True, n_samples=None, random_state=None, stratify=None):
def resample(
*arrays, replace=True, n_samples=None, random_state=None, stratify=None, copy=True
):
"""Resample arrays or sparse matrices in a consistent way.

The default strategy implements one step of the bootstrapping
Expand Down Expand Up @@ -484,12 +486,19 @@ def resample(*arrays, replace=True, n_samples=None, random_state=None, stratify=
If not None, data is split in a stratified fashion, using this as
the class labels.

copy : bool, default=True
Flag to return a copied array. This flag only takes effect when `replace=False`.
If `replace=False` and `copy=False`,
the indexable data-structures in `*arrays` are resampled in place,
consuming less memory.

Returns
-------
resampled_arrays : sequence of array-like of shape (n_samples,) or \
(n_samples, n_outputs)
Sequence of resampled copies of the collections. The original arrays
are not impacted.
are not impacted if `copy=True`.
If `replace=False` and `copy=False`, the original arrays are resampled too.

Examples
--------
Expand Down Expand Up @@ -552,6 +561,15 @@ def resample(*arrays, replace=True, n_samples=None, random_state=None, stratify=
% (max_n_samples, n_samples)
)

if not copy:
if replace:
raise ValueError("`copy=False` is valid only when `replace=False`")

# FIXME: Make sure MockDataFrame also supports `copy=False`
for a in arrays:
if hasattr(a, "iloc"):
raise ValueError("MockDataFrame does not support `copy=False`")

check_consistent_length(*arrays)

if stratify is None:
Expand Down Expand Up @@ -592,15 +610,22 @@ def resample(*arrays, replace=True, n_samples=None, random_state=None, stratify=

# convert sparse matrices to CSR for row-based indexing
arrays = [a.tocsr() if issparse(a) else a for a in arrays]
resampled_arrays = [_safe_indexing(a, indices) for a in arrays]

if copy:
resampled_arrays = [_safe_indexing(a, indices) for a in arrays]
else:
for a in arrays:
a[:] = _safe_indexing(a, indices)
Copy link
Member
@ogrisel ogrisel Dec 17, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unfortunately I am pretty sure that _safe_indexing(a, indices) still produces an intermediate allocation of the size of a.

It is not visible in the memory profile because this way of profiling the memory measures the memory usage before and after line 618, but not during the execution of line 618.

To confirm this (if you do not trust me :) you can try to write a Python script that calls shuffle(X) on a larger X (e.g. 1GB) and run it with memory_profiler in "sampling" mode (also known as "time-based mode") with the mprof run command instead of the line-by-line tracing mode.

I think the only way to implement in-place shuffling memory efficiently would be to implement the Durstenfeld variant of the Fisher–Yates algorithm:

https://en.wikipedia.org/wiki/Fisher%E2%80%93Yates_shuffle#The_modern_algorithm

EDIT: or Sattolo's algorithm:

https://en.wikipedia.org/wiki/Fisher%E2%80%93Yates_shuffle#Sattolo's_algorithm

This has several drawbacks though:

  • first it needs an explicit loop of size n_samples which might be slow to do in Python and would be much faster in Cython.
  • however while implementing it in Cython should be easily doable when all inputs are numpy arrays, this would be much more challenging for other input data structures (e.g. Python lists, pandas DataFrames/Series, and so on).
  • finally inplace assignments for pairs of indices that would work for all the datastructures supported by _safe_indexing would require a similar utility function (e.g. _safe_index_assign(X, indices, values)) because index based assignment has no consistent API for all those libraries.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you for your comment.
I understand that the problem is complicated.

This issue seems difficult for me to solve, so I'm sorry but I would like to close this PR...

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

no problem.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There might be a way forward to use numpy.random.shuffle as explained in #7754 (comment) but I am not sure how complex it would be to get the details right.

resampled_arrays = arrays

if len(resampled_arrays) == 1:
# syntactic sugar for the unit argument case
return resampled_arrays[0]
else:
return resampled_arrays


def shuffle(*arrays, random_state=None, n_samples=None):
def shuffle(*arrays, random_state=None, n_samples=None, copy=True):
"""Shuffle arrays or sparse matrices in a consistent way.

This is a convenience alias to ``resample(*arrays, replace=False)`` to do
Expand All @@ -623,11 +648,17 @@ def shuffle(*arrays, random_state=None, n_samples=None):
automatically set to the first dimension of the arrays. It should
not be larger than the length of arrays.

copy : bool, default=True
Flag to return a copied array. If set to False,
the indexable data-structures in `*arrays` are shuffled in place,
consuming less memory.

Returns
-------
shuffled_arrays : sequence of indexable data-structures
Sequence of shuffled copies of the collections. The original arrays
are not impacted.
are not impacted if `copy=True`.
If `copy=False`, the original arrays are shuffled too.

Examples
--------
Expand Down Expand Up @@ -666,8 +697,13 @@ def shuffle(*arrays, random_state=None, n_samples=None):
--------
resample
"""

return resample(
*arrays, replace=False, n_samples=n_samples, random_state=random_state
*arrays,
replace=False,
n_samples=n_samples,
random_state=random_state,
copy=copy,
)


Expand Down
44 changes: 38 additions & 6 deletions sklearn/utils/tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,14 @@ def test_resample():
resample([0], [0, 1])
with pytest.raises(ValueError):
resample([0, 1], [0, 1], replace=False, n_samples=3)
with pytest.raises(ValueError):
resample([0], [1], copy=False)
with pytest.raises(ValueError):
resample(
MockDataFrame(np.array([["a", 0], ["b", 1]], dtype=object)),
replace=False,
copy=False,
)

# Issue:6581, n_samples can be more when replace is True (default).
assert len(resample([1, 2], n_samples=5)) == 5
Expand Down Expand Up @@ -150,6 +158,7 @@ def test_resample_stratified_replace():
X_no_replace, _ = resample(
X, y, replace=False, n_samples=50, random_state=rng, stratify=y
)

assert np.unique(X_replace).shape[0] < 50
assert np.unique(X_no_replace).shape[0] == 50

Expand Down Expand Up @@ -183,6 +192,14 @@ def test_resample_stratify_sparse_error():
X, y = resample(X, y, n_samples=50, random_state=rng, stratify=stratify)


def test_resample_without_copy():
rng = np.random.RandomState(0)
n_samples = 100
X = rng.normal(size=(n_samples, 1))

assert id(X) == id(resample(X, replace=False, copy=False))


def test_safe_mask():
random_state = check_random_state(0)
X = random_state.rand(5, 4)
Expand Down Expand Up @@ -514,25 +531,31 @@ def test_get_column_indices_pandas_nonunique_columns_error(key):
assert str(exc_info.value) == err_msg


def test_shuffle_on_ndim_equals_three():
@pytest.mark.parametrize("copy", [True, False])
def test_shuffle_on_ndim_equals_three(copy):
def to_tuple(A): # to make the inner arrays hashable
return tuple(tuple(tuple(C) for C in B) for B in A)

A = np.array([[[1, 2], [3, 4]], [[5, 6], [7, 8]]]) # A.shape = (2,2,2)
S = set(to_tuple(A))
shuffle(A) # shouldn't raise a ValueError for dim = 3
shuffle(A.copy(), copy=copy) # shouldn't raise a ValueError for dim = 3
assert set(to_tuple(A)) == S


def test_shuffle_dont_convert_to_array():
@pytest.mark.parametrize("copy", [True, False])
def test_shuffle_dont_convert_to_array(copy):
# Check that shuffle does not try to convert to numpy arrays with float
# dtypes can let any indexable datastructure pass-through.
a = ["a", "b", "c"]
b = np.array(["a", "b", "c"], dtype=object)
c = [1, 2, 3]
d = MockDataFrame(np.array([["a", 0], ["b", 1], ["c", 2]], dtype=object))
e = sp.csc_matrix(np.arange(6).reshape(3, 2))
a_s, b_s, c_s, d_s, e_s = shuffle(a, b, c, d, e, random_state=0)

if copy:
a_s, b_s, c_s, d_s, e_s = shuffle(a, b, c, d, e, random_state=0, copy=copy)
else:
a_s, b_s, c_s, e_s = shuffle(a, b, c, e, random_state=0, copy=copy)

assert a_s == ["c", "b", "a"]
assert type(a_s) == list
Expand All @@ -543,12 +566,21 @@ def test_shuffle_dont_convert_to_array():
assert c_s == [3, 2, 1]
assert type(c_s) == list

assert_array_equal(d_s, np.array([["c", 2], ["b", 1], ["a", 0]], dtype=object))
assert type(d_s) == MockDataFrame
if copy:
assert_array_equal(d_s, np.array([["c", 2], ["b", 1], ["a", 0]], dtype=object))
assert type(d_s) == MockDataFrame

assert_array_equal(e_s.toarray(), np.array([[4, 5], [2, 3], [0, 1]]))


def test_shuffle_without_copy():
rng = np.random.RandomState(0)
n_samples = 100
X = rng.normal(size=(n_samples, 1))

assert id(X) == id(shuffle(X, copy=False))


def test_gen_even_slices():
# check that gen_even_slices contains all samples
some_range = range(10)
Expand Down
0