punndcoder28
diff --git a/sklearn/datasets/_base.py b/sklearn/datasets/_base.py
Lines changed: 38 additions & 16 deletions
diff --git a/sklearn/datasets/tests/test_base.py b/sklearn/datasets/tests/test_base.py
Lines changed: 3 additions & 2 deletions
diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py
Lines changed: 8 additions & 5 deletions
diff --git a/sklearn/datasets/tests/test_svmlight_format.py b/sklearn/datasets/tests/test_svmlight_format.py
Lines changed: 32 additions & 30 deletions
@@ -12,6 +12,7 @@
 import os
 import shutil
 from collections import namedtuple
+from importlib import resources
 from numbers import Integral
 from os import environ, listdir, makedirs
 from os.path import expanduser, isdir, join, splitext
@@ -23,7 +24,6 @@
 from ..preprocessing import scale
 from ..utils import Bunch, check_pandas_support, check_random_state
 from ..utils._param_validation import Interval, StrOptions, validate_params
-from ..utils.fixes import _contents, _open_binary, _open_text, _read_text
 
 DATA_MODULE = "sklearn.datasets.data"
 DESCR_MODULE = "sklearn.datasets.descr"
@@ -300,6 +300,7 @@ def load_csv_data(
     data_module=DATA_MODULE,
     descr_file_name=None,
     descr_module=DESCR_MODULE,
+    encoding="utf-8",
 ):
     """Loads `data_file_name` from `data_module with `importlib.resources`.
 
@@ -339,8 +340,14 @@ def load_csv_data(
     descr : str, optional
         Description of the dataset (the content of `descr_file_name`).
         Only returned if `descr_file_name` is not None.
+
+    encoding : str, default="utf-8"
+        Text encoding of the CSV file.
+
+        .. versionadded:: 1.4
     """
-    with _open_text(data_module, data_file_name) as csv_file:
+    data_path = resources.files(data_module) / data_file_name
+    with data_path.open("r", encoding=encoding) as csv_file:
         data_file = csv.reader(csv_file)
         temp = next(data_file)
         n_samples = int(temp[0])
@@ -413,7 +420,8 @@ def load_gzip_compressed_csv_data(
         Description of the dataset (the content of `descr_file_name`).
         Only returned if `descr_file_name` is not None.
     """
-    with _open_binary(data_module, data_file_name) as compressed_file:
+    data_path = resources.files(data_module) / data_file_name
+    with data_path.open("rb") as compressed_file:
         compressed_file = gzip.open(compressed_file, mode="rt", encoding=encoding)
         data = np.loadtxt(compressed_file, **kwargs)
 
@@ -425,7 +433,7 @@ def load_gzip_compressed_csv_data(
         return data, descr
 
 
-def load_descr(descr_file_name, *, descr_module=DESCR_MODULE):
+def load_descr(descr_file_name, *, descr_module=DESCR_MODULE, encoding="utf-8"):
     """Load `descr_file_name` from `descr_module` with `importlib.resources`.
 
     Parameters
@@ -440,14 +448,19 @@ def load_descr(descr_file_name, *, descr_module=DESCR_MODULE):
         Module where `descr_file_name` lives. See also :func:`load_descr`.
         The default is `'sklearn.datasets.descr'`.
 
+    encoding : str, default="utf-8"
+        Name of the encoding that `descr_file_name` will be decoded with.
+        The default is 'utf-8'.
+
+        .. versionadded:: 1.4
+
     Returns
     -------
     fdescr : str
         Content of `descr_file_name`.
     """
-    fdescr = _read_text(descr_module, descr_file_name)
-
-    return fdescr
+    path = resources.files(descr_module) / descr_file_name
+    return path.read_text(encoding=encoding)
 
 
 @validate_params(
@@ -1193,13 +1206,16 @@ def load_linnerud(*, return_X_y=False, as_frame=False):
     data_filename = "linnerud_exercise.csv"
     target_filename = "linnerud_physiological.csv"
 
+    data_module_path = resources.files(DATA_MODULE)
     # Read header and data
-    with _open_text(DATA_MODULE, data_filename) as f:
+    data_path = data_module_path / data_filename
+    with data_path.open("r", encoding="utf-8") as f:
         header_exercise = f.readline().split()
         f.seek(0)  # reset file obj
         data_exercise = np.loadtxt(f, skiprows=1)
 
-    with _open_text(DATA_MODULE, target_filename) as f:
+    target_path = data_module_path / target_filename
+    with target_path.open("r", encoding="utf-8") as f:
         header_physiological = f.readline().split()
         f.seek(0)  # reset file obj
         data_physiological = np.loadtxt(f, skiprows=1)
@@ -1277,13 +1293,19 @@ def load_sample_images():
     descr = load_descr("README.txt", descr_module=IMAGES_MODULE)
 
     filenames, images = [], []
-    for filename in sorted(_contents(IMAGES_MODULE)):
-        if filename.endswith(".jpg"):
-            filenames.append(filename)
-            with _open_binary(IMAGES_MODULE, filename) as image_file:
-                pil_image = Image.open(image_file)
-                image = np.asarray(pil_image)
-            images.append(image)
+
+    jpg_paths = sorted(
+        resource
+        for resource in resources.files(IMAGES_MODULE).iterdir()
+        if resource.is_file() and resource.match("*.jpg")
+    )
+
+    for path in jpg_paths:
+        filenames.append(str(path))
+        with path.open("rb") as image_file:
+            pil_image = Image.open(image_file)
+            image = np.asarray(pil_image)
+        images.append(image)
 
     return Bunch(images=images, filenames=filenames, DESCR=descr)
 
 
@@ -3,6 +3,7 @@
 import tempfile
 import warnings
 from functools import partial
+from importlib import resources
 from pathlib import Path
 from pickle import dumps, loads
 
@@ -29,7 +30,6 @@
 from sklearn.datasets.tests.test_common import check_as_frame
 from sklearn.preprocessing import scale
 from sklearn.utils import Bunch
-from sklearn.utils.fixes import _is_resource
 
 
 class _DummyPath:
@@ -291,7 +291,8 @@ def test_loader(loader_func, data_shape, target_shape, n_target, has_descr, file
         assert "data_module" in bunch
         assert all(
             [
-                f in bunch and _is_resource(bunch["data_module"], bunch[f])
+                f in bunch
+                and (resources.files(bunch["data_module"]) / bunch[f]).is_file()
                 for f in filenames
             ]
         )
 
@@ -4,6 +4,7 @@
 import os
 import re
 from functools import partial
+from importlib import resources
 from io import BytesIO
 from urllib.error import HTTPError
 
@@ -27,7 +28,6 @@
     assert_array_equal,
     fails_if_pypy,
 )
-from sklearn.utils.fixes import _open_binary
 
 OPENML_TEST_DATA_MODULE = "sklearn.datasets.tests.data.openml"
 # if True, urlopen will be monkey patched to only use local files
@@ -107,8 +107,9 @@ def _mock_urlopen_shared(url, has_gzip_header, expected_prefix, suffix):
         assert url.startswith(expected_prefix)
 
         data_file_name = _file_name(url, suffix)
+        data_file_path = resources.files(data_module) / data_file_name
 
-        with _open_binary(data_module, data_file_name) as f:
+        with data_file_path.open("rb") as f:
             if has_gzip_header and gzip_response:
                 fp = BytesIO(f.read())
                 return _MockHTTPResponse(fp, True)
@@ -145,9 +146,10 @@ def _mock_urlopen_data_list(url, has_gzip_header):
         assert url.startswith(url_prefix_data_list)
 
         data_file_name = _file_name(url, ".json")
+        data_file_path = resources.files(data_module) / data_file_name
 
         # load the file itself, to simulate a http error
-        with _open_binary(data_module, data_file_name) as f:
+        with data_file_path.open("rb") as f:
             decompressed_f = read_fn(f, "rb")
             decoded_s = decompressed_f.read().decode("utf-8")
             json_data = json.loads(decoded_s)
@@ -156,7 +158,7 @@ def _mock_urlopen_data_list(url, has_gzip_header):
                 url=None, code=412, msg="Simulated mock error", hdrs=None, fp=None
             )
 
-        with _open_binary(data_module, data_file_name) as f:
+        with data_file_path.open("rb") as f:
             if has_gzip_header:
                 fp = BytesIO(f.read())
                 return _MockHTTPResponse(fp, True)
@@ -1507,8 +1509,9 @@ def test_fetch_openml_verify_checksum(monkeypatch, as_frame, cache, tmpdir, pars
     # create a temporary modified arff file
     original_data_module = OPENML_TEST_DATA_MODULE + "." + f"id_{data_id}"
     original_data_file_name = "data-v1-dl-1666876.arff.gz"
+    original_data_path = resources.files(original_data_module) / original_data_file_name
     corrupt_copy_path = tmpdir / "test_invalid_checksum.arff"
-    with _open_binary(original_data_module, original_data_file_name) as orig_file:
+    with original_data_path.open("rb") as orig_file:
         orig_gzip = gzip.open(orig_file, "rb")
         data = bytearray(orig_gzip.read())
         data[len(data) - 1] = 37
 
@@ -2,6 +2,7 @@
 import os
 import shutil
 from bz2 import BZ2File
+from importlib import resources
 from io import BytesIO
 from tempfile import NamedTemporaryFile
 
@@ -17,7 +18,7 @@
     assert_array_equal,
     fails_if_pypy,
 )
-from sklearn.utils.fixes import CSR_CONTAINERS, _open_binary, _path
+from sklearn.utils.fixes import CSR_CONTAINERS
 
 TEST_DATA_MODULE = "sklearn.datasets.tests.data"
 datafile = "svmlight_classification.txt"
@@ -28,11 +29,16 @@
 pytestmark = fails_if_pypy
 
 
+def _svmlight_local_test_file_path(filename):
+    return resources.files(TEST_DATA_MODULE) / filename
+
+
 def _load_svmlight_local_test_file(filename, **kwargs):
     """
     Helper to load resource `filename` with `importlib.resources`
     """
-    with _open_binary(TEST_DATA_MODULE, filename) as f:
+    data_path = _svmlight_local_test_file_path(filename)
+    with data_path.open("rb") as f:
         return load_svmlight_file(f, **kwargs)
 
 
@@ -76,24 +82,25 @@ def test_load_svmlight_file_fd():
 
     # GH20081: testing equality between path-based and
     # fd-based load_svmlight_file
-    with _path(TEST_DATA_MODULE, datafile) as data_path:
-        data_path = str(data_path)
-        X1, y1 = load_svmlight_file(data_path)
 
-        fd = os.open(data_path, os.O_RDONLY)
-        try:
-            X2, y2 = load_svmlight_file(fd)
-            assert_array_almost_equal(X1.data, X2.data)
-            assert_array_almost_equal(y1, y2)
-        finally:
-            os.close(fd)
+    data_path = resources.files(TEST_DATA_MODULE) / datafile
+    data_path = str(data_path)
+    X1, y1 = load_svmlight_file(data_path)
+
+    fd = os.open(data_path, os.O_RDONLY)
+    try:
+        X2, y2 = load_svmlight_file(fd)
+        assert_array_almost_equal(X1.data, X2.data)
+        assert_array_almost_equal(y1, y2)
+    finally:
+        os.close(fd)
 
 
 def test_load_svmlight_pathlib():
     # test that loading from a path object matches loading from its string form
-    with _path(TEST_DATA_MODULE, datafile) as data_path:
-        X1, y1 = load_svmlight_file(str(data_path))
-        X2, y2 = load_svmlight_file(data_path)
+    data_path = _svmlight_local_test_file_path(datafile)
+    X1, y1 = load_svmlight_file(str(data_path))
+    X2, y2 = load_svmlight_file(data_path)
 
     assert_allclose(X1.data, X2.data)
     assert_allclose(y1, y2)
@@ -105,19 +112,16 @@ def test_load_svmlight_file_multilabel():
 
 
 def test_load_svmlight_files():
-    with _path(TEST_DATA_MODULE, datafile) as data_path:
-        X_train, y_train, X_test, y_test = load_svmlight_files(
-            [str(data_path)] * 2, dtype=np.float32
-        )
+    data_path = _svmlight_local_test_file_path(datafile)
+    X_train, y_train, X_test, y_test = load_svmlight_files(
+        [str(data_path)] * 2, dtype=np.float32
+    )
     assert_array_equal(X_train.toarray(), X_test.toarray())
     assert_array_almost_equal(y_train, y_test)
     assert X_train.dtype == np.float32
     assert X_test.dtype == np.float32
 
-    with _path(TEST_DATA_MODULE, datafile) as data_path:
-        X1, y1, X2, y2, X3, y3 = load_svmlight_files(
-            [str(data_path)] * 3, dtype=np.float64
-        )
+    X1, y1, X2, y2, X3, y3 = load_svmlight_files([str(data_path)] * 3, dtype=np.float64)
     assert X1.dtype == X2.dtype
     assert X2.dtype == X3.dtype
     assert X3.dtype == np.float64
@@ -145,7 +149,7 @@ def test_load_compressed():
 
     with NamedTemporaryFile(prefix="sklearn-test", suffix=".gz") as tmp:
         tmp.close()  # necessary under windows
-        with _open_binary(TEST_DATA_MODULE, datafile) as f:
+        with _svmlight_local_test_file_path(datafile).open("rb") as f:
             with gzip.open(tmp.name, "wb") as fh_out:
                 shutil.copyfileobj(f, fh_out)
         Xgz, ygz = load_svmlight_file(tmp.name)
@@ -157,7 +161,7 @@ def test_load_compressed():
 
     with NamedTemporaryFile(prefix="sklearn-test", suffix=".bz2") as tmp:
         tmp.close()  # necessary under windows
-        with _open_binary(TEST_DATA_MODULE, datafile) as f:
+        with _svmlight_local_test_file_path(datafile).open("rb") as f:
             with BZ2File(tmp.name, "wb") as fh_out:
                 shutil.copyfileobj(f, fh_out)
         Xbz, ybz = load_svmlight_file(tmp.name)
@@ -236,11 +240,9 @@ def test_load_large_qid():
 
 def test_load_invalid_file2():
     with pytest.raises(ValueError):
-        with (
-            _path(TEST_DATA_MODULE, datafile) as data_path,
-            _path(TEST_DATA_MODULE, invalidfile) as invalid_path,
-        ):
-            load_svmlight_files([str(data_path), str(invalid_path), str(data_path)])
+        data_path = _svmlight_local_test_file_path(datafile)
+        invalid_path = _svmlight_local_test_file_path(invalidfile)
+        load_svmlight_files([str(data_path), str(invalid_path), str(data_path)])
 
 
 def test_not_a_filename():