8000 [MRG] FIX segmentation fault on memory mapped contiguous memoryview (… · scikit-learn/scikit-learn@47b2e33 · GitHub
[go: up one dir, main page]

Skip to content

Commit 47b2e33

Browse files
lorentzenchr authored and glemaitre committed
[MRG] FIX segmentation fault on memory mapped contiguous memoryview (#21654)
1 parent 05e7064 commit 47b2e33

File tree

4 files changed

+68
-15
lines changed

4 files changed

+68
-15
lines changed

sklearn/utils/_readonly_array_wrapper.pyx

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -48,12 +48,12 @@ cdef class ReadonlyArrayWrapper:
4848
PyBuffer_Release(buffer)
4949

5050

51-
def _test_sum(NUM_TYPES[:] x):
51+
def _test_sum(NUM_TYPES[::1] x):
5252
"""This function is for testing only.
5353
5454
As this function does not modify x, we would like to define it as
5555
56-
_test_sum(const NUM_TYPES[:] x)
56+
_test_sum(const NUM_TYPES[::1] x)
5757
5858
which is not possible as fused typed const memoryviews aren't
5959
supported in Cython<3.0.

sklearn/utils/_testing.py

Lines changed: 21 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -520,19 +520,36 @@ def __exit__(self, exc_type, exc_val, exc_tb):
520520
_delete_folder(self.temp_folder)
521521

522522

523-
def create_memmap_backed_data(data, mmap_mode="r", return_folder=False):
523+
def create_memmap_backed_data(data, mmap_mode="r", return_folder=False, aligned=False):
524524
"""
525525
Parameters
526526
----------
527527
data
528528
mmap_mode : str, default='r'
529529
return_folder : bool, default=False
530+
aligned : bool, default=False
531+
If True, if input is a single numpy array and if the input array is aligned,
532+
the memory mapped array will also be aligned. This is a workaround for
533+
https://github.com/joblib/joblib/issues/563.
530534
"""
531535
temp_folder = tempfile.mkdtemp(prefix="sklearn_testing_")
532536
atexit.register(functools.partial(_delete_folder, temp_folder, warn=True))
533-
filename = op.join(temp_folder, "data.pkl")
534-
joblib.dump(data, filename)
535-
memmap_backed_data = joblib.load(filename, mmap_mode=mmap_mode)
537+
if aligned:
538+
if isinstance(data, np.ndarray) and data.flags.aligned:
539+
# https://numpy.org/doc/stable/reference/generated/numpy.memmap.html
540+
filename = op.join(temp_folder, "data.dat")
541+
fp = np.memmap(filename, dtype=data.dtype, mode="w+", shape=data.shape)
542+
fp[:] = data[:] # write data to memmap array
543+
fp.flush()
544+
memmap_backed_data = np.memmap(
545+
filename, dtype=data.dtype, mode=mmap_mode, shape=data.shape
546+
)
547+
else:
548+
raise ValueError("If aligned=True, input must be a single numpy array.")
549+
else:
550+
filename = op.join(temp_folder, "data.pkl")
551+
joblib.dump(data, filename)
552+
memmap_backed_data = joblib.load(filename, mmap_mode=mmap_mode)
536553
result = (
537554
memmap_backed_data if not return_folder else (memmap_backed_data, temp_folder)
538555
)

sklearn/utils/tests/test_readonly_wrapper.py

Lines changed: 7 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -13,7 +13,13 @@ def _readonly_array_copy(x):
1313
return y
1414

1515

16-
@pytest.mark.parametrize("readonly", [_readonly_array_copy, create_memmap_backed_data])
16+
def _create_memmap_backed_data(data):
17+
return create_memmap_backed_data(
18+
data, mmap_mode="r", return_folder=False, aligned=True
19+
)
20+
21+
22+
@pytest.mark.parametrize("readonly", [_readonly_array_copy, _create_memmap_backed_data])
1723
@pytest.mark.parametrize("dtype", [np.float32, np.float64, np.int32, np.int64])
1824
def test_readonly_array_wrapper(readonly, dtype):
1925
"""Test that ReadonlyWrapper allows working with fused-typed."""

sklearn/utils/tests/test_testing.py

Lines changed: 38 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,7 @@
1212

1313
from sklearn.utils.deprecation import deprecated
1414
from sklearn.utils.metaestimators import available_if, if_delegate_has_method
15+
from sklearn.utils._readonly_array_wrapper import _test_sum
1516
from sklearn.utils._testing import (
1617
assert_raises,
1718
assert_warns,
@@ -680,30 +681,59 @@ def test_tempmemmap(monkeypatch):
680681
assert registration_counter.nb_calls == 2
681682

682683

683-
def test_create_memmap_backed_data(monkeypatch):
684+
@pytest.mark.parametrize("aligned", [False, True])
685+
def test_create_memmap_backed_data(monkeypatch, aligned):
684686
registration_counter = RegistrationCounter()
685687
monkeypatch.setattr(atexit, "register", registration_counter)
686688

687689
input_array = np.ones(3)
688-
data = create_memmap_backed_data(input_array)
690+
data = create_memmap_backed_data(input_array, aligned=aligned)
689691
check_memmap(input_array, data)
690692
assert registration_counter.nb_calls == 1
691693

692-
data, folder = create_memmap_backed_data(input_array, return_folder=True)
694+
data, folder = create_memmap_backed_data(
695+
input_array, return_folder=True, aligned=aligned
696+
)
693697
check_memmap(input_array, data)
694698
assert folder == os.path.dirname(data.filename)
695699
assert registration_counter.nb_calls == 2
696700

697701
mmap_mode = "r+"
698-
data = create_memmap_backed_data(input_array, mmap_mode=mmap_mode)
702+
data = create_memmap_backed_data(input_array, mmap_mode=mmap_mode, aligned=aligned)
699703
check_memmap(input_array, data, mmap_mode)
700704
assert registration_counter.nb_calls == 3
701705

702706
input_list = [input_array, input_array + 1, input_array + 2]
703-
mmap_data_list = create_memmap_backed_data(input_list)
704-
for input_array, data in zip(input_list, mmap_data_list):
705-
check_memmap(input_array, data)
706-
assert registration_counter.nb_calls == 4
707+
if aligned:
708+
with pytest.raises(
709+
ValueError, match="If aligned=True, input must be a single numpy array."
710+
):
711+
create_memmap_backed_data(input_list, aligned=True)
712+
else:
713+
mmap_data_list = create_memmap_backed_data(input_list, aligned=False)
714+
for input_array, data in zip(input_list, mmap_data_list):
715+
check_memmap(input_array, data)
716+
assert registration_counter.nb_calls == 4
717+
718+
719+
@pytest.mark.parametrize("dtype", [np.float32, np.float64, np.int32, np.int64])
720+
def test_memmap_on_contiguous_data(dtype):
721+
"""Test memory mapped array on contiguous memoryview."""
722+
x = np.arange(10).astype(dtype)
723+
assert x.flags["C_CONTIGUOUS"]
724+
assert x.flags["ALIGNED"]
725+
726+
# _test_sum consumes contiguous arrays
727+
# def _test_sum(NUM_TYPES[::1] x):
728+
sum_origin = _test_sum(x)
729+
730+
# now on memory mapped data
731+
# aligned=True to avoid https://github.com/joblib/joblib/issues/563
732+
# without alignment, this can produce segmentation faults, see
733+
# https://github.com/scikit-learn/scikit-learn/pull/21654
734+
x_mmap = create_memmap_backed_data(x, mmap_mode="r+", aligned=True)
735+
sum_mmap = _test_sum(x_mmap)
736+
assert sum_mmap == pytest.approx(sum_origin, rel=1e-11)
707737

708738

709739
@pytest.mark.parametrize(

0 commit comments

Comments
 (0)
0