MAINT Remove Python<3.9 code from sklearn.utils.fixes (#27945) · punndcoder28/scikit-learn@dc23e3f · GitHub
[go: up one dir, main page]

Skip to content

Commit dc23e3f

Browse files
lesteve, ogrisel, glemaitre
authored
MAINT Remove Python<3.9 code from sklearn.utils.fixes (scikit-learn#27945)
Co-authored-by: Olivier Grisel <olivier.grisel@ensta.org> Co-authored-by: Guillaume Lemaitre <g.lemaitre58@gmail.com>
1 parent 2d4197d commit dc23e3f

File tree

5 files changed

+81
-106
lines changed

5 files changed

+81
-106
lines changed

sklearn/datasets/_base.py

Lines changed: 38 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
import os
1313
import shutil
1414
from collections import namedtuple
15+
from importlib import resources
1516
from numbers import Integral
1617
from os import environ, listdir, makedirs
1718
from os.path import expanduser, isdir, join, splitext
@@ -23,7 +24,6 @@
2324
from ..preprocessing import scale
2425
from ..utils import Bunch, check_pandas_support, check_random_state
2526
from ..utils._param_validation import Interval, StrOptions, validate_params
26-
from ..utils.fixes import _contents, _open_binary, _open_text, _read_text
2727

2828
DATA_MODULE = "sklearn.datasets.data"
2929
DESCR_MODULE = "sklearn.datasets.descr"
@@ -300,6 +300,7 @@ def load_csv_data(
300300
data_module=DATA_MODULE,
301301
descr_file_name=None,
302302
descr_module=DESCR_MODULE,
303+
encoding="utf-8",
303304
):
304305
"""Loads `data_file_name` from `data_module` with `importlib.resources`.
305306
@@ -339,8 +340,14 @@ def load_csv_data(
339340
descr : str, optional
340341
Description of the dataset (the content of `descr_file_name`).
341342
Only returned if `descr_file_name` is not None.
343+
344+
encoding : str, default="utf-8"
345+
Text encoding of the CSV file.
346+
347+
.. versionadded:: 1.4
342348
"""
343-
with _open_text(data_module, data_file_name) as csv_file:
349+
data_path = resources.files(data_module) / data_file_name
350+
with data_path.open("r", encoding="utf-8") as csv_file:
344351
data_file = csv.reader(csv_file)
345352
temp = next(data_file)
346353
n_samples = int(temp[0])
@@ -413,7 +420,8 @@ def load_gzip_compressed_csv_data(
413420
Description of the dataset (the content of `descr_file_name`).
414421
Only returned if `descr_file_name` is not None.
415422
"""
416-
with _open_binary(data_module, data_file_name) as compressed_file:
423+
data_path = resources.files(data_module) / data_file_name
424+
with data_path.open("rb") as compressed_file:
417425
compressed_file = gzip.open(compressed_file, mode="rt", encoding=encoding)
418426
data = np.loadtxt(compressed_file, **kwargs)
419427

@@ -425,7 +433,7 @@ def load_gzip_compressed_csv_data(
425433
return data, descr
426434

427435

428-
def load_descr(descr_file_name, *, descr_module=DESCR_MODULE):
436+
def load_descr(descr_file_name, *, descr_module=DESCR_MODULE, encoding="utf-8"):
429437
"""Load `descr_file_name` from `descr_module` with `importlib.resources`.
430438
431439
Parameters
@@ -440,14 +448,19 @@ def load_descr(descr_file_name, *, descr_module=DESCR_MODULE):
440448
Module where `descr_file_name` lives. See also :func:`load_descr`.
441449
The default is `'sklearn.datasets.descr'`.
442450
451+
encoding : str, default="utf-8"
452+
Name of the encoding that `descr_file_name` will be decoded with.
453+
The default is 'utf-8'.
454+
455+
.. versionadded:: 1.4
456+
443457
Returns
444458
-------
445459
fdescr : str
446460
Content of `descr_file_name`.
447461
"""
448-
fdescr = _read_text(descr_module, descr_file_name)
449-
450-
return fdescr
462+
path = resources.files(descr_module) / descr_file_name
463+
return path.read_text(encoding=encoding)
451464

452465

453466
@validate_params(
@@ -1193,13 +1206,16 @@ def load_linnerud(*, return_X_y=False, as_frame=False):
11931206
data_filename = "linnerud_exercise.csv"
11941207
target_filename = "linnerud_physiological.csv"
11951208

1209+
data_module_path = resources.files(DATA_MODULE)
11961210
# Read header and data
1197-
with _open_text(DATA_MODULE, data_filename) as f:
1211+
data_path = data_module_path / data_filename
1212+
with data_path.open("r", encoding="utf-8") as f:
11981213
header_exercise = f.readline().split()
11991214
f.seek(0) # reset file obj
12001215
data_exercise = np.loadtxt(f, skiprows=1)
12011216

1202-
with _open_text(DATA_MODULE, target_filename) as f:
1217+
target_path = data_module_path / target_filename
1218+
with target_path.open("r", encoding="utf-8") as f:
12031219
header_physiological = f.readline().split()
12041220
f.seek(0) # reset file obj
12051221
data_physiological = np.loadtxt(f, skiprows=1)
@@ -1277,13 +1293,19 @@ def load_sample_images():
12771293
descr = load_descr("README.txt", descr_module=IMAGES_MODULE)
12781294

12791295
filenames, images = [], []
1280-
for filename in sorted(_contents(IMAGES_MODULE)):
1281-
if filename.endswith(".jpg"):
1282-
filenames.append(filename)
1283-
with _open_binary(IMAGES_MODULE, filename) as image_file:
1284-
pil_image = Image.open(image_file)
1285-
image = np.asarray(pil_image)
1286-
images.append(image)
1296+
1297+
jpg_paths = sorted(
1298+
resource
1299+
for resource in resources.files(IMAGES_MODULE).iterdir()
1300+
if resource.is_file() and resource.match("*.jpg")
1301+
)
1302+
1303+
for path in jpg_paths:
1304+
filenames.append(str(path))
1305+
with path.open("rb") as image_file:
1306+
pil_image = Image.open(image_file)
1307+
image = np.asarray(pil_image)
1308+
images.append(image)
12871309

12881310
return Bunch(images=images, filenames=filenames, DESCR=descr)
12891311

sklearn/datasets/tests/test_base.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import tempfile
44
import warnings
55
from functools import partial
6+
from importlib import resources
67
from pathlib import Path
78
from pickle import dumps, loads
89

@@ -29,7 +30,6 @@
2930
from sklearn.datasets.tests.test_common import check_as_frame
3031
from sklearn.preprocessing import scale
3132
from sklearn.utils import Bunch
32-
from sklearn.utils.fixes import _is_resource
3333

3434

3535
class _DummyPath:
@@ -291,7 +291,8 @@ def test_loader(loader_func, data_shape, target_shape, n_target, has_descr, file
291291
assert "data_module" in bunch
292292
assert all(
293293
[
294-
f in bunch and _is_resource(bunch["data_module"], bunch[f])
294+
f in bunch
295+
and (resources.files(bunch["data_module"]) / bunch[f]).is_file()
295296
for f in filenames
296297
]
297298
)

sklearn/datasets/tests/test_openml.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import os
55
import re
66
from functools import partial
7+
from importlib import resources
78
from io import BytesIO
89
from urllib.error import HTTPError
910

@@ -27,7 +28,6 @@
2728
assert_array_equal,
2829
fails_if_pypy,
2930
)
30-
from sklearn.utils.fixes import _open_binary
3131

3232
OPENML_TEST_DATA_MODULE = "sklearn.datasets.tests.data.openml"
3333
# if True, urlopen will be monkey patched to only use local files
@@ -107,8 +107,9 @@ def _mock_urlopen_shared(url, has_gzip_header, expected_prefix, suffix):
107107
assert url.startswith(expected_prefix)
108108

109109
data_file_name = _file_name(url, suffix)
110+
data_file_path = resources.files(data_module) / data_file_name
110111

111-
with _open_binary(data_module, data_file_name) as f:
112+
with data_file_path.open("rb") as f:
112113
if has_gzip_header and gzip_response:
113114
fp = BytesIO(f.read())
114115
return _MockHTTPResponse(fp, True)
@@ -145,9 +146,10 @@ def _mock_urlopen_data_list(url, has_gzip_header):
145146
assert url.startswith(url_prefix_data_list)
146147

147148
data_file_name = _file_name(url, ".json")
149+
data_file_path = resources.files(data_module) / data_file_name
148150

149151
# load the file itself, to simulate a http error
150-
with _open_binary(data_module, data_file_name) as f:
152+
with data_file_path.open("rb") as f:
151153
decompressed_f = read_fn(f, "rb")
152154
decoded_s = decompressed_f.read().decode("utf-8")
153155
json_data = json.loads(decoded_s)
@@ -156,7 +158,7 @@ def _mock_urlopen_data_list(url, has_gzip_header):
156158
url=None, code=412, msg="Simulated mock error", hdrs=None, fp=None
157159
)
158160

159-
with _open_binary(data_module, data_file_name) as f:
161+
with data_file_path.open("rb") as f:
160162
if has_gzip_header:
161163
fp = BytesIO(f.read())
162164
return _MockHTTPResponse(fp, True)
@@ -1507,8 +1509,9 @@ def test_fetch_openml_verify_checksum(monkeypatch, as_frame, cache, tmpdir, pars
15071509
# create a temporary modified arff file
15081510
original_data_module = OPENML_TEST_DATA_MODULE + "." + f"id_{data_id}"
15091511
original_data_file_name = "data-v1-dl-1666876.arff.gz"
1512+
original_data_path = resources.files(original_data_module) / original_data_file_name
15101513
corrupt_copy_path = tmpdir / "test_invalid_checksum.arff"
1511-
with _open_binary(original_data_module, original_data_file_name) as orig_file:
1514+
with original_data_path.open("rb") as orig_file:
15121515
orig_gzip = gzip.open(orig_file, "rb")
15131516
data = bytearray(orig_gzip.read())
15141517
data[len(data) - 1] = 37

sklearn/datasets/tests/test_svmlight_format.py

Lines changed: 32 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import os
33
import shutil
44
from bz2 import BZ2File
5+
from importlib import resources
56
from io import BytesIO
67
from tempfile import NamedTemporaryFile
78

@@ -17,7 +18,7 @@
1718
assert_array_equal,
1819
fails_if_pypy,
1920
)
20-
from sklearn.utils.fixes import CSR_CONTAINERS, _open_binary, _path
21+
from sklearn.utils.fixes import CSR_CONTAINERS
2122

2223
TEST_DATA_MODULE = "sklearn.datasets.tests.data"
2324
datafile = "svmlight_classification.txt"
@@ -28,11 +29,16 @@
2829
pytestmark = fails_if_pypy
2930

3031

32+
def _svmlight_local_test_file_path(filename):
33+
return resources.files(TEST_DATA_MODULE) / filename
34+
35+
3136
def _load_svmlight_local_test_file(filename, **kwargs):
3237
"""
3338
Helper to load resource `filename` with `importlib.resources`
3439
"""
35-
with _open_binary(TEST_DATA_MODULE, filename) as f:
40+
data_path = _svmlight_local_test_file_path(filename)
41+
with data_path.open("rb") as f:
3642
return load_svmlight_file(f, **kwargs)
3743

3844

@@ -76,24 +82,25 @@ def test_load_svmlight_file_fd():
7682

7783
# GH20081: testing equality between path-based and
7884
# fd-based load_svmlight_file
79-
with _path(TEST_DATA_MODULE, datafile) as data_path:
80-
data_path = str(data_path)
81-
X1, y1 = load_svmlight_file(data_path)
8285

83-
fd = os.open(data_path, os.O_RDONLY)
84-
try:
85-
X2, y2 = load_svmlight_file(fd)
86-
assert_array_almost_equal(X1.data, X2.data)
87-
assert_array_almost_equal(y1, y2)
88-
finally:
89-
os.close(fd)
86+
data_path = resources.files(TEST_DATA_MODULE) / datafile
87+
data_path = str(data_path)
88+
X1, y1 = load_svmlight_file(data_path)
89+
90+
fd = os.open(data_path, os.O_RDONLY)
91+
try:
92+
X2, y2 = load_svmlight_file(fd)
93+
assert_array_almost_equal(X1.data, X2.data)
94+
assert_array_almost_equal(y1, y2)
95+
finally:
96+
os.close(fd)
9097

9198

9299
def test_load_svmlight_pathlib():
93100
# test loading from a path object (as opposed to a str path)
94-
with _path(TEST_DATA_MODULE, datafile) as data_path:
95-
X1, y1 = load_svmlight_file(str(data_path))
96-
X2, y2 = load_svmlight_file(data_path)
101+
data_path = _svmlight_local_test_file_path(datafile)
102+
X1, y1 = load_svmlight_file(str(data_path))
103+
X2, y2 = load_svmlight_file(data_path)
97104

98105
assert_allclose(X1.data, X2.data)
99106
assert_allclose(y1, y2)
@@ -105,19 +112,16 @@ def test_load_svmlight_file_multilabel():
105112

106113

107114
def test_load_svmlight_files():
108-
with _path(TEST_DATA_MODULE, datafile) as data_path:
109-
X_train, y_train, X_test, y_test = load_svmlight_files(
110-
[str(data_path)] * 2, dtype=np.float32
111-
)
115+
data_path = _svmlight_local_test_file_path(datafile)
116+
X_train, y_train, X_test, y_test = load_svmlight_files(
117+
[str(data_path)] * 2, dtype=np.float32
118+
)
112119
assert_array_equal(X_train.toarray(), X_test.toarray())
113120
assert_array_almost_equal(y_train, y_test)
114121
assert X_train.dtype == np.float32
115122
assert X_test.dtype == np.float32
116123

117-
with _path(TEST_DATA_MODULE, datafile) as data_path:
118-
X1, y1, X2, y2, X3, y3 = load_svmlight_files(
119-
[str(data_path)] * 3, dtype=np.float64
120-
)
124+
X1, y1, X2, y2, X3, y3 = load_svmlight_files([str(data_path)] * 3, dtype=np.float64)
121125
assert X1.dtype == X2.dtype
122126
assert X2.dtype == X3.dtype
123127
assert X3.dtype == np.float64
@@ -145,7 +149,7 @@ def test_load_compressed():
145149

146150
with NamedTemporaryFile(prefix="sklearn-test", suffix=".gz") as tmp:
147151
tmp.close() # necessary under windows
148-
with _open_binary(TEST_DATA_MODULE, datafile) as f:
152+
with _svmlight_local_test_file_path(datafile).open("rb") as f:
149153
with gzip.open(tmp.name, "wb") as fh_out:
150154
shutil.copyfileobj(f, fh_out)
151155
Xgz, ygz = load_svmlight_file(tmp.name)
@@ -157,7 +161,7 @@ def test_load_compressed():
157161

158162
with NamedTemporaryFile(prefix="sklearn-test", suffix=".bz2") as tmp:
159163
tmp.close() # necessary under windows
160-
with _open_binary(TEST_DATA_MODULE, datafile) as f:
164+
with _svmlight_local_test_file_path(datafile).open("rb") as f:
161165
with BZ2File(tmp.name, "wb") as fh_out:
162166
shutil.copyfileobj(f, fh_out)
163167
Xbz, ybz = load_svmlight_file(tmp.name)
@@ -236,11 +240,9 @@ def test_load_large_qid():
236240

237241
def test_load_invalid_file2():
238242
with pytest.raises(ValueError):
239-
with (
240-
_path(TEST_DATA_MODULE, datafile) as data_path,
241-
_path(TEST_DATA_MODULE, invalidfile) as invalid_path,
242-
):
243-
load_svmlight_files([str(data_path), str(invalid_path), str(data_path)])
243+
data_path = _svmlight_local_test_file_path(datafile)
244+
invalid_path = _svmlight_local_test_file_path(invalidfile)
245+
load_svmlight_files([str(data_path), str(invalid_path), str(data_path)])
244246

245247

246248
def test_not_a_filename():

0 commit comments

Comments
 (0)
0