diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index ce9069a6e8671..5e016668fd5bb 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -1144,6 +1144,12 @@ Changelog - |API| Fixed several bugs in :func:`utils.graph.graph_shortest_path`, which is now deprecated. Use `scipy.sparse.csgraph.shortest_path` instead. :pr:`20531` by `Tom Dupre la Tour`_. + +- |Enhancement| Added `copy` args to :func:`utils.shuffle`. If `copy=False`, + indexable data-structures in `*arrays` are destructed + instead of consuming less memory. :func:`utils.shuffle` without copy does not + support :class:`utils._mocking.MockDataFrame`. It raises `ValueError`. + :pr:`22003` by :user:`murata-yu`. Code and Documentation Contributors ----------------------------------- diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index 3d8a1ca87d210..0d2126ac899f0 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -450,7 +450,9 @@ def _get_column_indices(X, key): ) -def resample(*arrays, replace=True, n_samples=None, random_state=None, stratify=None): +def resample( + *arrays, replace=True, n_samples=None, random_state=None, stratify=None, copy=True +): """Resample arrays or sparse matrices in a consistent way. The default strategy implements one step of the bootstrapping @@ -484,12 +486,19 @@ def resample(*arrays, replace=True, n_samples=None, random_state=None, stratify= If not None, data is split in a stratified fashion, using this as the class labels. + copy : bool, default=True + Flag to return the copied array. This frag works if `replace=False`. + If `replace=False` and `copy=False`, + indexable data-structures in `*arrays` are destructed + instead of consuming less memory. + Returns ------- resampled_arrays : sequence of array-like of shape (n_samples,) or \ (n_samples, n_outputs) Sequence of resampled copies of the collections. The original arrays - are not impacted. + are not impacted if `copy=True`. + If `replace=False` and `copy=False`, the original arrays are resampled too. Examples -------- @@ -552,6 +561,15 @@ def resample(*arrays, replace=True, n_samples=None, random_state=None, stratify= % (max_n_samples, n_samples) ) + if not copy: + if replace: + raise ValueError("`copy=False` is valid only when `replace=False`") + + # FIXME: Make sure MockDataFrame also supports `copy=False`` + for a in arrays: + if hasattr(a, "iloc"): + raise ValueError("MockDataFrame does not support `copy=False`") + check_consistent_length(*arrays) if stratify is None: @@ -592,7 +610,14 @@ def resample(*arrays, replace=True, n_samples=None, random_state=None, stratify= # convert sparse matrices to CSR for row-based indexing arrays = [a.tocsr() if issparse(a) else a for a in arrays] - resampled_arrays = [_safe_indexing(a, indices) for a in arrays] + + if copy: + resampled_arrays = [_safe_indexing(a, indices) for a in arrays] + else: + for a in arrays: + a[:] = _safe_indexing(a, indices) + resampled_arrays = arrays + if len(resampled_arrays) == 1: # syntactic sugar for the unit argument case return resampled_arrays[0] @@ -600,7 +625,7 @@ def resample(*arrays, replace=True, n_samples=None, random_state=None, stratify= return resampled_arrays -def shuffle(*arrays, random_state=None, n_samples=None): +def shuffle(*arrays, random_state=None, n_samples=None, copy=True): """Shuffle arrays or sparse matrices in a consistent way. This is a convenience alias to ``resample(*arrays, replace=False)`` to do @@ -623,11 +648,17 @@ def shuffle(*arrays, random_state=None, n_samples=None): automatically set to the first dimension of the arrays. It should not be larger than the length of arrays. + copy : bool, default=True + Flag to return the copied array. If set False, + indexable data-structures in `*arrays` are destructed + instead of consuming less memory. + Returns ------- shuffled_arrays : sequence of indexable data-structures Sequence of shuffled copies of the collections. The original arrays - are not impacted. + are not impacted if `copy=True`. + If `copy=False`, the original arrays are shuffled too. Examples -------- @@ -666,8 +697,13 @@ def shuffle(*arrays, random_state=None, n_samples=None): -------- resample """ + return resample( - *arrays, replace=False, n_samples=n_samples, random_state=random_state + *arrays, + replace=False, + n_samples=n_samples, + random_state=random_state, + copy=copy, ) diff --git a/sklearn/utils/tests/test_utils.py b/sklearn/utils/tests/test_utils.py index 24638d26b6138..60617f0dc516a 100644 --- a/sklearn/utils/tests/test_utils.py +++ b/sklearn/utils/tests/test_utils.py @@ -116,6 +116,14 @@ def test_resample(): resample([0], [0, 1]) with pytest.raises(ValueError): resample([0, 1], [0, 1], replace=False, n_samples=3) + with pytest.raises(ValueError): + resample([0], [1], copy=False) + with pytest.raises(ValueError): + resample( + MockDataFrame(np.array([["a", 0], ["b", 1]], dtype=object)), + replace=False, + copy=False, + ) # Issue:6581, n_samples can be more when replace is True (default). assert len(resample([1, 2], n_samples=5)) == 5 @@ -150,6 +158,7 @@ def test_resample_stratified_replace(): X_no_replace, _ = resample( X, y, replace=False, n_samples=50, random_state=rng, stratify=y ) + assert np.unique(X_replace).shape[0] < 50 assert np.unique(X_no_replace).shape[0] == 50 @@ -183,6 +192,14 @@ def test_resample_stratify_sparse_error(): X, y = resample(X, y, n_samples=50, random_state=rng, stratify=stratify) +def test_resample_without_copy(): + rng = np.random.RandomState(0) + n_samples = 100 + X = rng.normal(size=(n_samples, 1)) + + assert id(X) == id(resample(X, replace=False, copy=False)) + + def test_safe_mask(): random_state = check_random_state(0) X = random_state.rand(5, 4) @@ -514,17 +531,19 @@ def test_get_column_indices_pandas_nonunique_columns_error(key): assert str(exc_info.value) == err_msg -def test_shuffle_on_ndim_equals_three(): +@pytest.mark.parametrize("copy", [True, False]) +def test_shuffle_on_ndim_equals_three(copy): def to_tuple(A): # to make the inner arrays hashable return tuple(tuple(tuple(C) for C in B) for B in A) A = np.array([[[1, 2], [3, 4]], [[5, 6], [7, 8]]]) # A.shape = (2,2,2) S = set(to_tuple(A)) - shuffle(A) # shouldn't raise a ValueError for dim = 3 + shuffle(A.copy(), copy=copy) # shouldn't raise a ValueError for dim = 3 assert set(to_tuple(A)) == S -def test_shuffle_dont_convert_to_array(): +@pytest.mark.parametrize("copy", [True, False]) +def test_shuffle_dont_convert_to_array(copy): # Check that shuffle does not try to convert to numpy arrays with float # dtypes can let any indexable datastructure pass-through. a = ["a", "b", "c"] @@ -532,7 +551,11 @@ def test_shuffle_dont_convert_to_array(): c = [1, 2, 3] d = MockDataFrame(np.array([["a", 0], ["b", 1], ["c", 2]], dtype=object)) e = sp.csc_matrix(np.arange(6).reshape(3, 2)) - a_s, b_s, c_s, d_s, e_s = shuffle(a, b, c, d, e, random_state=0) + + if copy: + a_s, b_s, c_s, d_s, e_s = shuffle(a, b, c, d, e, random_state=0, copy=copy) + else: + a_s, b_s, c_s, e_s = shuffle(a, b, c, e, random_state=0, copy=copy) assert a_s == ["c", "b", "a"] assert type(a_s) == list @@ -543,12 +566,21 @@ def test_shuffle_dont_convert_to_array(): assert c_s == [3, 2, 1] assert type(c_s) == list - assert_array_equal(d_s, np.array([["c", 2], ["b", 1], ["a", 0]], dtype=object)) - assert type(d_s) == MockDataFrame + if copy: + assert_array_equal(d_s, np.array([["c", 2], ["b", 1], ["a", 0]], dtype=object)) + assert type(d_s) == MockDataFrame assert_array_equal(e_s.toarray(), np.array([[4, 5], [2, 3], [0, 1]])) +def test_shuffle_without_copy(): + rng = np.random.RandomState(0) + n_samples = 100 + X = rng.normal(size=(n_samples, 1)) + + assert id(X) == id(shuffle(X, copy=False)) + + def test_gen_even_slices(): # check that gen_even_slices contains all samples some_range = range(10)