8000 ENH Implement utils.shuffle without copy #7754 by murata-yu · Pull Request #22003 · scikit-learn/scikit-learn · GitHub
[go: up one dir, main page]

Skip to content

ENH Implement utils.shuffle without copy #7754 #22003

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 10 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions doc/whats_new/v1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1144,6 +1144,12 @@ Changelog
- |API| Fixed several bugs in :func:`utils.graph.graph_shortest_path`, which is
now deprecated. Use `scipy.sparse.csgraph.shortest_path` instead. :pr:`20531`
by `Tom Dupre la Tour`_.

- |Enhancement| Added a `copy` argument to :func:`utils.shuffle`. If `copy=False`,
the indexable data-structures in `*arrays` are shuffled in place,
consuming less memory. :func:`utils.shuffle` with `copy=False` does not
support :class:`utils._mocking.MockDataFrame`; it raises a `ValueError`.
:pr:`22003` by :user:`murata-yu`.

Code and Documentation Contributors
-----------------------------------
Expand Down
48 changes: 42 additions & 6 deletions sklearn/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -450,7 +450,9 @@ def _get_column_indices(X, key):
)


def resample(*arrays, replace=True, n_samples=None, random_state=None, stratify=None):
def resample(
*arrays, replace=True, n_samples=None, random_state=None, stratify=None, copy=True
):
"""Resample arrays or sparse matrices in a consistent way.

The default strategy implements one step of the bootstrapping
Expand Down Expand Up @@ -484,12 +486,19 @@ def resample(*arrays, replace=True, n_samples=None, random_state=None, stratify=
If not None, data is split in a stratified fashion, using this as
the class labels.

copy : bool, default=True
Flag to return a copied array. This flag only takes effect when `replace=False`.
If `replace=False` and `copy=False`,
the indexable data-structures in `*arrays` are resampled in place,
consuming less memory.

Returns
-------
resampled_arrays : sequence of array-like of shape (n_samples,) or \
(n_samples, n_outputs)
Sequence of resampled copies of the collections. The original arrays
are not impacted.
are not impacted if `copy=True`.
If `replace=False` and `copy=False`, the original arrays are resampled too.

Examples
--------
Expand Down Expand Up @@ -552,6 +561,15 @@ def resample(*arrays, replace=True, n_samples=None, random_state=None, stratify=
% (max_n_samples, n_samples)
)

if not copy:
if replace:
raise ValueError("`copy=False` is valid only when `replace=False`")

# FIXME: Make sure MockDataFrame also supports `copy=False`
for a in arrays:
if hasattr(a, "iloc"):
raise ValueError("MockDataFrame does not support `copy=False`")

check_consistent_length(*arrays)

if stratify is None:
Expand Down Expand Up @@ -592,15 +610,22 @@ def resample(*arrays, replace=True, n_samples=None, random_state=None, stratify=

# convert sparse matrices to CSR for row-based indexing
arrays = [a.tocsr() if issparse(a) else a for a in arrays]
resampled_arrays = [_safe_indexing(a, indices) for a in arrays]

if copy:
resampled_arrays = [_safe_indexing(a, indices) for a in arrays]
else:
for a in arrays:
a[:] = _safe_indexing(a, indices)
Copy link
Member
@ogrisel ogrisel Dec 17, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unfortunately I am pretty sure that _safe_indexing(a, indices) still produces an intermediate allocation of the size of a.

It is not visible in the memory profile because this way of profiling the memory measures the memory usage before and after line 618, but not during the execution of line 618.

To confirm this (if you do not trust me :) you can try to write a Python script that calls shuffle(X) on a larger X (e.g. 1GB) and run it with memory_profiler in "sampling" mode (also known as "time-based mode") with the mprof run command instead of the line-by-line tracing mode.

I think the only way to implement in-place shuffling memory efficiently would be to implement the Durstenfeld variant of the Fisher–Yates algorithm:

https://en.wikipedia.org/wiki/Fisher%E2%80%93Yates_shuffle#The_modern_algorithm

EDIT: or Sattolo's algorithm:

https://en.wikipedia.org/wiki/Fisher%E2%80%93Yates_shuffle#Sattolo's_algorithm

This has several drawbacks though:

  • first it needs an explicit loop of size n_samples which might be slow to do in Python and would be much faster in Cython.
  • however while implementing it in Cython should be easily doable when all inputs are numpy arrays, this would be much more challenging for other input data structures (e.g. Python lists, pandas DataFrames/Series, and so on).
  • finally inplace assignments for pairs of indices that would work for all the datastructures supported by _safe_indexing would require a similar utility function (e.g. _safe_index_assign(X, indices, values)) because index based assignment has no consistent API for all those libraries.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you for your comment.
I understand that the problem is complicated.

This issue seems difficult for me to solve, so I'm sorry but I would like to close this PR...

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

no problem.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There might be a way forward to use numpy.random.shuffle as explained in #7754 (comment) but I am not sure how complex it would be to get the details right.

resampled_arrays = arrays

if len(resampled_arrays) == 1:
# syntactic sugar for the unit argument case
return resampled_arrays[0]
else:
return resampled_arrays


def shuffle(*arrays, random_state=None, n_samples=None):
def shuffle(*arrays, random_state=None, n_samples=None, copy=True):
"""Shuffle arrays or sparse matrices in a consistent way.

This is a convenience alias to ``resample(*arrays, replace=False)`` to do
Expand All @@ -623,11 +648,17 @@ def shuffle(*arrays, random_state=None, n_samples=None):
automatically set to the first dimension of the arrays. It should
not be larger than the length of arrays.

copy : bool, default=True
Flag to return a copied array. If set to False,
the indexable data-structures in `*arrays` are shuffled in place,
consuming less memory.

Returns
-------
shuffled_arrays : sequence of indexable data-structures
Sequence of shuffled copies of the collections. The original arrays
are not impacted.
are not impacted if `copy=True`.
If `copy=False`, the original arrays are shuffled too.

Examples
--------
Expand Down Expand Up @@ -666,8 +697,13 @@ def shuffle(*arrays, random_state=None, n_samples=None):
--------
resample
"""

return resample(
*arrays, replace=False, n_samples=n_samples, random_state=random_state
*arrays,
replace=False,
n_samples=n_samples,
random_state=random_state,
copy=copy,
)


Expand Down
44 changes: 38 additions & 6 deletions sklearn/utils/tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,14 @@ def test_resample():
resample([0], [0, 1])
with pytest.raises(ValueError):
resample([0, 1], [0, 1], replace=False, n_samples=3)
with pytest.raises(ValueError):
resample([0], [1], copy=False)
with pytest.raises(ValueError):
resample(
MockDataFrame(np.array([["a", 0], ["b", 1]], dtype=object)),
replace=False,
copy=False,
)

# Issue:6581, n_samples can be more when replace is True (default).
assert len(resample([1, 2], n_samples=5)) == 5
Expand Down Expand Up @@ -150,6 +158,7 @@ def test_resample_stratified_replace():
X_no_replace, _ = resample(
X, y, replace=False, n_samples=50, random_state=rng, stratify=y
)

assert np.unique(X_replace).shape[0] < 50
assert np.unique(X_no_replace).shape[0] == 50

Expand Down Expand Up @@ -183,6 +192,14 @@ def test_resample_stratify_sparse_error():
X, y = resample(X, y, n_samples=50, random_state=rng, stratify=stratify)


def test_resample_without_copy():
rng = np.random.RandomState(0)
n_samples = 100
X = rng.normal(size=(n_samples, 1))

assert id(X) == id(resample(X, replace=False, copy=False))


def test_safe_mask():
random_state = check_random_state(0)
X = random_state.rand(5, 4)
Expand Down Expand Up @@ -514,25 +531,31 @@ def test_get_column_indices_pandas_nonunique_columns_error(key):
assert str(exc_info.value) == err_msg


def test_shuffle_on_ndim_equals_three():
@pytest.mark.parametrize("copy", [True, False])
def test_shuffle_on_ndim_equals_three(copy):
def to_tuple(A): # to make the inner arrays hashable
return tuple(tuple(tuple(C) for C in B) for B in A)

A = np.array([[[1, 2], [3, 4]], [[5, 6], [7, 8]]]) # A.shape = (2,2,2)
S = set(to_tuple(A))
shuffle(A) # shouldn't raise a ValueError for dim = 3
shuffle(A.copy(), copy=copy) # shouldn't raise a ValueError for dim = 3
assert set(to_tuple(A)) == S


def test_shuffle_dont_convert_to_array():
@pytest.mark.parametrize("copy", [True, False])
def test_shuffle_dont_convert_to_array(copy):
# Check that shuffle does not try to convert to numpy arrays with float
# dtypes can let any indexable datastructure pass-through.
a = ["a", "b", "c"]
b = np.array(["a", "b", "c"], dtype=object)
c = [1, 2, 3]
d = MockDataFrame(np.array([["a", 0], ["b", 1], ["c", 2]], dtype=object))
e = sp.csc_matrix(np.arange(6).reshape(3, 2))
a_s, b_s, c_s, d_s, e_s = shuffle(a, b, c, d, e, random_state=0)

if copy:
a_s, b_s, c_s, d_s, e_s = shuffle(a, b, c, d, e, random_state=0, copy=copy)
else:
a_s, b_s, c_s, e_s = shuffle(a, b, c, e, random_state=0, copy=copy)

assert a_s == ["c", "b", "a"]
assert type(a_s) == list
Expand All @@ -543,12 +566,21 @@ def test_shuffle_dont_convert_to_array():
assert c_s == [3, 2, 1]
assert type(c_s) == list

assert_array_equal(d_s, np.array([["c", 2], ["b", 1], ["a", 0]], dtype=object))
assert type(d_s) == MockDataFrame
if copy:
assert_array_equal(d_s, np.array([["c", 2], ["b", 1], ["a", 0]], dtype=object))
assert type(d_s) == MockDataFrame

assert_array_equal(e_s.toarray(), np.array([[4, 5], [2, 3], [0, 1]]))


def test_shuffle_without_copy():
rng = np.random.RandomState(0)
n_samples = 100
X = rng.normal(size=(n_samples, 1))

assert id(X) == id(shuffle(X, copy=False))


def test_gen_even_slices():
# check that gen_even_slices contains all samples
some_range = range(10)
Expand Down
0