8000 [MRG] FIX segmentation fault on memory mapped contiguous memoryview by lorentzenchr · Pull Request #21654 · scikit-learn/scikit-learn · GitHub
[go: up one dir, main page]

Skip to content

[MRG] FIX segmentation fault on memory mapped contiguous memoryview #21654

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Nov 18, 2021
Merged
4 changes: 2 additions & 2 deletions sklearn/utils/_readonly_array_wrapper.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -48,12 +48,12 @@ cdef class ReadonlyArrayWrapper:
PyBuffer_Release(buffer)


def _test_sum(NUM_TYPES[:] x):
def _test_sum(NUM_TYPES[::1] x):
"""This function is for testing only.

As this function does not modify x, we would like to define it as

_test_sum(const NUM_TYPES[:] x)
_test_sum(const NUM_TYPES[::1] x)

which is not possible as fused typed const memoryviews aren't
supported in Cython<3.0.
Expand Down
25 changes: 21 additions & 4 deletions sklearn/utils/_testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -520,19 +520,36 @@ def __exit__(self, exc_type, exc_val, exc_tb):
_delete_folder(self.temp_folder)


def create_memmap_backed_data(data, mmap_mode="r", return_folder=False):
def create_memmap_backed_data(data, mmap_mode="r", return_folder=False, aligned=False):
"""
Parameters
----------
data
mmap_mode : str, default='r'
return_folder : bool, default=False
aligned : bool, default=False
If True, if input is a single numpy array and if the input array is aligned,
the memory mapped array will also be aligned. This is a workaround for
https://github.com/joblib/joblib/issues/563.
Comment on lines +530 to +533
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
aligned : bool, default=False
If True, if input is a single numpy array and if the input array is aligned,
the memory mapped array will also be aligned. This is a workaround for
https://github.com/joblib/joblib/issues/563.
aligned : bool, default=False
If True, if input is a single numpy array and if the input array is aligned,
the memory mapped array will also be aligned. This is a workaround for
https://github.com/joblib/joblib/issues/563 .

"""
temp_folder = tempfile.mkdtemp(prefix="sklearn_testing_")
atexit.register(functools.partial(_delete_folder, temp_folder, warn=True))
filename = op.join(temp_folder, "data.pkl")
joblib.dump(data, filename)
memmap_backed_data = joblib.load(filename, mmap_mode=mmap_mode)
if aligned:
if isinstance(data, np.ndarray) and data.flags.aligned:
# https://numpy.org/doc/stable/reference/generated/numpy.memmap.html
filename = op.join(temp_folder, "data.dat")
fp = np.memmap(filename, dtype=data.dtype, mode="w+", shape=data.shape)
fp[:] = data[:] # write data to memmap array
fp.flush()
memmap_backed_data = np.memmap(
filename, dtype=data.dtype, mode=mmap_mode, shape=data.shape
)
else:
raise ValueError("If aligned=True, input must be a single numpy array.")
else:
filename = op.join(temp_folder, "data.pkl")
Comment on lines +537 to +550
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
if aligned:
if isinstance(data, np.ndarray) and data.flags.aligned:
# https://numpy.org/doc/stable/reference/generated/numpy.memmap.html
filename = op.join(temp_folder, "data.dat")
fp = np.memmap(filename, dtype=data.dtype, mode="w+", shape=data.shape)
fp[:] = data[:] # write data to memmap array
fp.flush()
memmap_backed_data = np.memmap(
filename, dtype=data.dtype, mode=mmap_mode, shape=data.shape
)
else:
raise ValueError("If aligned=True, input must be a single numpy array.")
else:
filename = op.join(temp_folder, "data.pkl")
# TODO: remove the aligned case once https://github.com/joblib/joblib/issues/563
# is fixed.
if aligned:
if isinstance(data, np.ndarray) and data.flags.aligned:
# https://numpy.org/doc/stable/reference/generated/numpy.memmap.html
filename = op.join(temp_folder, "data.dat")
fp = np.memmap(filename, dtype=data.dtype, mode="w+", shape=data.shape)
fp[:] = data[:] # write data to memmap array
fp.flush()
memmap_backed_data = np.memmap(
filename, dtype=data.dtype, mode=mmap_mode, shape=data.shape
)
else:
raise ValueError("If aligned=True, input must be a single numpy array.")
else:
filename = op.join(temp_folder, "data.pkl")

joblib.dump(data, filename)
memmap_backed_data = joblib.load(filename, mmap_mode=mmap_mode)
result = (
memmap_backed_data if not return_folder else (memmap_backed_data, temp_folder)
)
Expand Down
8 changes: 7 additions & 1 deletion sklearn/utils/tests/test_readonly_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,13 @@ def _readonly_array_copy(x):
return y


@pytest.mark.parametrize("readonly", [_readonly_array_copy, create_memmap_backed_data])
def _create_memmap_backed_data(data):
return create_memmap_backed_data(
data, mmap_mode="r", return_folder=False, aligned=True
)


@pytest.mark.parametrize("readonly", [_readonly_array_copy, _create_memmap_backed_data])
Comment on lines +16 to +22
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nitpick. Note that this change might not be black-compliant.

Suggested change
def _create_memmap_backed_data(data):
return create_memmap_backed_data(
data, mmap_mode="r", return_folder=False, aligned=True
)
@pytest.mark.parametrize("readonly", [_readonly_array_copy, _create_memmap_backed_data])
# TODO: remove this fixture and its associated parametrization
# once https://github.com/joblib/joblib/issues/563 is fixed.
def _create_aligned_memmap_backed_data(data):
return create_memmap_backed_data(
data, mmap_mode="r", return_folder=False, aligned=True
)
@pytest.mark.parametrize("readonly", [_readonly_array_copy, _create_aligned_memmap_backed_data])

@pytest.mark.parametrize("dtype", [np.float32, np.float64, np.int32, np.int64])
def test_readonly_array_wrapper(readonly, dtype):
"""Test that ReadonlyWrapper allows working with fused-typed."""
Expand Down
46 changes: 38 additions & 8 deletions sklearn/utils/tests/test_testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

from sklearn.utils.deprecation import deprecated
from sklearn.utils.metaestimators import available_if, if_delegate_has_method
from sklearn.utils._readonly_array_wrapper import _test_sum
from sklearn.utils._testing import (
assert_raises,
assert_warns,
Expand Down Expand Up @@ -680,30 +681,59 @@ def test_tempmemmap(monkeypatch):
assert registration_counter.nb_calls == 2


def test_create_memmap_backed_data(monkeypatch):
@pytest.mark.parametrize("aligned", [False, True])
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
@pytest.mark.parametrize("aligned", [False, True])
# TODO: remove this test parametrization on aligned once
# https://github.com/joblib/joblib/issues/563 is fixed.
@pytest.mark.parametrize("aligned", [False, True])

def test_create_memmap_backed_data(monkeypatch, aligned):
registration_counter = RegistrationCounter()
monkeypatch.setattr(atexit, "register", registration_counter)

input_array = np.ones(3)
data = create_memmap_backed_data(input_array)
data = create_memmap_backed_data(input_array, aligned=aligned)
check_memmap(input_array, data)
assert registration_counter.nb_calls == 1

data, folder = create_memmap_backed_data(input_array, return_folder=True)
data, folder = create_memmap_backed_data(
input_array, return_folder=True, aligned=aligned
)
check_memmap(input_array, data)
assert folder == os.path.dirname(data.filename)
assert registration_counter.nb_calls == 2

mmap_mode = "r+"
data = create_memmap_backed_data(input_array, mmap_mode=mmap_mode)
data = create_memmap_backed_data(input_array, mmap_mode=mmap_mode, aligned=aligned)
check_memmap(input_array, data, mmap_mode)
assert registration_counter.nb_calls == 3

input_list = [input_array, input_array + 1, input_array + 2]
mmap_data_list = create_memmap_backed_data(input_list)
for input_array, data in zip(input_list, mmap_data_list):
check_memmap(input_array, data)
assert registration_counter.nb_calls == 4
if aligned:
with pytest.raises(
ValueError, match="If aligned=True, input must be a single numpy array."
):
create_memmap_backed_data(input_list, aligned=True)
else:
mmap_data_list = create_memmap_backed_data(input_list, aligned=False)
for input_array, data in zip(input_list, mmap_data_list):
check_memmap(input_array, data)
assert registration_counter.nb_calls == 4


@pytest.mark.parametrize("dtype", [np.float32, np.float64, np.int32, np.int64])
def test_memmap_on_contiguous_data(dtype):
"""Test memory mapped array on contigous memoryview."""
x = np.arange(10).astype(dtype)
assert x.flags["C_CONTIGUOUS"]
assert x.flags["ALIGNED"]

# _test_sum consumes contiguous arrays
# def _test_sum(NUM_TYPES[::1] x):
sum_origin = _test_sum(x)

# now on memory mapped data
# aligned=True so avoid https://github.com/joblib/joblib/issues/563
# without alignment, this can produce segmentation faults, see
# https://github.com/scikit-learn/scikit-learn/pull/21654
x_mmap = create_memmap_backed_data(x, mmap_mode="r+", aligned=True)
sum_mmap = _test_sum(x_mmap)
assert sum_mmap == pytest.approx(sum_origin, rel=1e-11)
Comment on lines +718 to +736
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
@pytest.mark.parametrize("dtype", [np.float32, np.float64, np.int32, np.int64])
def test_memmap_on_contiguous_data(dtype):
"""Test memory mapped array on contigous memoryview."""
x = np.arange(10).astype(dtype)
assert x.flags["C_CONTIGUOUS"]
assert x.flags["ALIGNED"]
# _test_sum consumes contiguous arrays
# def _test_sum(NUM_TYPES[::1] x):
sum_origin = _test_sum(x)
# now on memory mapped data
# aligned=True so avoid https://github.com/joblib/joblib/issues/563
# without alignment, this can produce segmentation faults, see
# https://github.com/scikit-learn/scikit-learn/pull/21654
x_mmap = create_memmap_backed_data(x, mmap_mode="r+", aligned=True)
sum_mmap = _test_sum(x_mmap)
assert sum_mmap == pytest.approx(sum_origin, rel=1e-11)
# TODO: remove once https://github.com/joblib/joblib/issues/563 is fixed.
@pytest.mark.parametrize("dtype", [np.float32, np.float64, np.int32, np.int64])
def test_memmap_on_contiguous_data(dtype):
"""Test memory mapped array on contigous memoryview."""
x = np.arange(10).astype(dtype)
assert x.flags["C_CONTIGUOUS"]
assert x.flags["ALIGNED"]
# _test_sum consumes contiguous arrays
# def _test_sum(NUM_TYPES[::1] x):
sum_origin = _test_sum(x)
# now on memory mapped data
# aligned=True so avoid https://github.com/joblib/joblib/issues/563
# without alignment, this can produce segmentation faults, see
# https://github.com/scikit-learn/scikit-learn/pull/21654
x_mmap = create_memmap_backed_data(x, mmap_mode="r+", aligned=True)
sum_mmap = _test_sum(x_mmap)
assert sum_mmap == pytest.approx(sum_origin, rel=1e-11)



@pytest.mark.parametrize(
Expand Down
0