ENH Uses gzip when caching in fetch_openml (#11830) · scikit-learn/scikit-learn@83e7375 · GitHub
[go: up one dir, main page]

Skip to content

Commit 83e7375

Browse files
thomasjpfan authored and jnothman committed
ENH Uses gzip when caching in fetch_openml (#11830)
1 parent 51b1b7c commit 83e7375

File tree

2 files changed

+152
-57
lines changed

2 files changed

+152
-57
lines changed

sklearn/datasets/openml.py

Lines changed: 19 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -7,18 +7,18 @@
77

88
try:
99
# Python 3+
10-
from urllib.request import urlopen
10+
from urllib.request import urlopen, Request
1111
except ImportError:
1212
# Python 2
13-
from urllib2 import urlopen
13+
from urllib2 import urlopen, Request
1414

1515

1616
import numpy as np
1717
import scipy.sparse
1818

1919
from sklearn.externals import _arff
2020
from .base import get_data_home
21-
from ..externals.six import string_types, PY2
21+
from ..externals.six import string_types, PY2, BytesIO
2222
from ..externals.six.moves.urllib.error import HTTPError
2323
from ..utils import Bunch
2424

@@ -50,8 +50,18 @@ def _open_openml_url(openml_path, data_home):
5050
result : stream
5151
A stream to the OpenML resource
5252
"""
53+
req = Request(_OPENML_PREFIX + openml_path)
54+
req.add_header('Accept-encoding', 'gzip')
55+
fsrc = urlopen(req)
56+
is_gzip = fsrc.info().get('Content-Encoding', '') == 'gzip'
57+
5358
if data_home is None:
54-
return urlopen(_OPENML_PREFIX + openml_path)
59+
if is_gzip:
60+
if PY2:
61+
fsrc = BytesIO(fsrc.read())
62+
return gzip.GzipFile(fileobj=fsrc, mode='rb')
63+
return fsrc
64+
5565
local_path = os.path.join(data_home, 'openml.org', openml_path + ".gz")
5666
if not os.path.exists(local_path):
5767
try:
@@ -61,15 +71,16 @@ def _open_openml_url(openml_path, data_home):
6171
pass
6272

6373
try:
64-
with gzip.GzipFile(local_path, 'wb') as fdst:
65-
fsrc = urlopen(_OPENML_PREFIX + openml_path)
74+
with open(local_path, 'wb') as fdst:
6675
shutil.copyfileobj(fsrc, fdst)
6776
fsrc.close()
6877
except Exception:
6978
os.unlink(local_path)
7079
raise
7180
# XXX: unnecessary decompression on first access
72-
return gzip.GzipFile(local_path, 'rb')
81+
if is_gzip:
82+
return gzip.GzipFile(local_path, 'rb')
83+
return fsrc
7384

7485

7586
def _get_json_content_from_openml_api(url, error_message, raise_if_error,
@@ -308,7 +319,7 @@ def _download_data_arff(file_id, sparse, data_home, encode_nominal=True):
308319
return_type = _arff.DENSE
309320

310321
if PY2:
311-
arff_file = _arff.load(response, encode_nominal=encode_nominal,
322+
arff_file = _arff.load(response.read(), encode_nominal=encode_nominal,
312323
return_type=return_type, )
313324
else:
314325
arff_file = _arff.loads(response.read().decode('utf-8'),

0 commit comments

Comments
 (0)
0