1
1
import gzip
2
2
import json
3
3
import os
4
- from io import BytesIO
5
4
import hashlib
6
- import shutil
7
5
from os .path import join
8
6
from warnings import warn
9
7
from contextlib import closing
@@ -63,7 +61,7 @@ def wrapper():
63
61
return decorator
64
62
65
63
66
def _openml_url_bytes(openml_path, data_home, expected_md5_checksum=None):
    """
    Returns a resource from OpenML.org. Caches it to data_home if required.

    Parameters
    ----------
    openml_path : str
        OpenML URL that will be prefixed with _OPENML_PREFIX before the
        request is issued.

    data_home : str or None
        Directory to which the resource is cached (gzip-compressed).
        If None, no caching is applied and the resource is fetched directly.

    expected_md5_checksum : str or None
        If set, the md5 of the (decompressed) payload is validated against
        this value; a mismatch raises ValueError.

    Returns
    -------
    result : bytes
        Byte content of resource
    """
    def is_gzip(_fsrc):
        # The server advertises gzip payloads via the Content-Encoding header.
        return _fsrc.info().get('Content-Encoding', '') == 'gzip'

    req = Request(_OPENML_PREFIX + openml_path)
    req.add_header('Accept-encoding', 'gzip')

    def _md5_validated_bytes(bytes_content, md5_checksum):
        """
        Validate the md5 checksum of the given bytes.

        Parameters
        ----------
        bytes_content : bytes

        md5_checksum : str
            Expected md5 checksum of bytes

        Returns
        -------
        bytes
            The unchanged ``bytes_content``, when the checksum matches.
        """
        actual_md5_checksum = hashlib.md5(bytes_content).hexdigest()
        if md5_checksum != actual_md5_checksum:
            raise ValueError("md5checksum: {} does not match expected: "
                             "{}".format(actual_md5_checksum,
                                         md5_checksum))
        return bytes_content

    if data_home is None:
        # No cache: fetch, decompress if needed, and return the raw bytes.
        fsrc = urlopen(req)
        if is_gzip(fsrc):
            fsrc = gzip.GzipFile(fileobj=fsrc, mode='rb')
        bytes_content = fsrc.read()
        if expected_md5_checksum:
            # validating checksum reads and consumes the stream
            return _md5_validated_bytes(bytes_content, expected_md5_checksum)
        return bytes_content

    local_path = _get_local_path(openml_path, data_home)
    if not os.path.exists(local_path):
        # NOTE(review): the diff hunk hides the lines between the existence
        # check and the download; reconstructed as the conventional
        # makedirs-then-try pattern — confirm against upstream.
        try:
            os.makedirs(os.path.dirname(local_path))
        except OSError:
            # directory may already exist (e.g. created concurrently)
            pass

        try:
            with closing(urlopen(req)) as fsrc:
                if is_gzip(fsrc):  # unzip it for checksum validation
                    fsrc = gzip.GzipFile(fileobj=fsrc, mode='rb')
                bytes_content = fsrc.read()
                if expected_md5_checksum:
                    bytes_content = _md5_validated_bytes(bytes_content,
                                                         expected_md5_checksum)
                with gzip.GzipFile(local_path, 'wb') as fdst:
                    fdst.write(bytes_content)
        except Exception:
            # Never leave a partially-written cache file behind.
            if os.path.exists(local_path):
                os.unlink(local_path)
            raise
    else:
        # Cache hit: the payload was stored gzip-compressed above.
        with gzip.GzipFile(local_path, "rb") as gzip_file:
            bytes_content = gzip_file.read()

    # XXX: on a cache miss the freshly downloaded bytes are re-read from the
    # compressed cache only on subsequent calls; first call returns them
    # directly.
    return bytes_content
150
151
151
152
152
153
def _get_json_content_from_openml_api (url , error_message , raise_if_error ,
@@ -183,8 +184,7 @@ def _get_json_content_from_openml_api(url, error_message, raise_if_error,
183
184
184
185
@_retry_with_clean_cache (url , data_home )
185
186
def _load_json ():
186
- with closing (_open_openml_url (url , data_home )) as response :
187
- return json .loads (response .read ().decode ("utf-8" ))
187
+ return json .loads (_openml_url_bytes (url , data_home ).decode ("utf-8" ))
188
188
189
189
try :
190
190
return _load_json ()
@@ -489,16 +489,16 @@ def _download_data_arff(file_id, sparse, data_home, encode_nominal=True,
489
489
490
490
@_retry_with_clean_cache (url , data_home )
491
491
def _arff_load ():
492
- with closing ( _open_openml_url ( url , data_home , expected_md5_checksum )) \
493
- as response :
494
- if sparse is True :
495
- return_type = _arff .COO
496
- else :
497
- return_type = _arff .DENSE_GEN
498
-
499
- arff_file = _arff .loads (response . read () .decode ('utf-8' ),
500
- encode_nominal = encode_nominal ,
501
- return_type = return_type )
492
+ bytes_content = _openml_url_bytes ( url , data_home ,
493
+ expected_md5_checksum )
494
+ if sparse is True :
495
+ return_type = _arff .COO
496
+ else :
497
+ return_type = _arff .DENSE_GEN
498
+
499
+ arff_file = _arff .loads (bytes_content .decode ('utf-8' ),
500
+ encode_nominal = encode_nominal ,
501
+ return_type = return_type )
502
502
return arff_file
503
503
504
504
return _arff_load ()
0 commit comments