@@ -86,38 +86,59 @@ def is_gzip(_fsrc):
86
86
req = Request (_OPENML_PREFIX + openml_path )
87
87
req .add_header ('Accept-encoding' , 'gzip' )
88
88
89
- def _md5_validated_bytes ( bytes_content , md5_checksum ):
89
+ def _md5_validated_bytestream ( fsrc , expected_md5 = None , chunk_size = 512 ):
90
90
"""
91
- Consume binary stream to validate checksum,
92
- return a new stream with same content
91
+ Takes in a byte-stream, reads in chunks and returns bytes.
92
+ If expected_md5 is not none, keeps md5 checksum state while streaming
93
+ and validates post stream consumption.
93
94
94
95
Parameters
95
96
----------
96
- bytes_content : bytes
97
97
98
- md5_checksum: str
99
- Expected md5 checksum of bytes
98
+ fsrc : io.BufferedIOBase
99
+ input stream to read bytes from
100
+
101
+ expected_md5 : str
102
+ expected md5-checksum value
103
+
104
+ chunk_size : int
105
+ size of chunks to read at a time from stream
100
106
101
107
Returns
102
108
-------
103
- bytes
109
+ fsrc_bytes : bytes
110
+ equivalent to fsrc_bytes.read() but with md5 validation if
111
+ expected_md5 is provided
112
+
113
+ Raises
114
+ ------
115
+
116
+ ValueError
104
117
"""
105
- actual_md5_checksum = hashlib .md5 (bytes_content ).hexdigest ()
106
- if md5_checksum != actual_md5_checksum :
118
+ fsrc_bytes = bytes ()
119
+ file_md5 = hashlib .md5 () if expected_md5_checksum else None
120
+ while True :
121
+ data = fsrc .read (chunk_size )
122
+ if not data :
123
+ break
124
+ if expected_md5 :
125
+ file_md5 .update (data )
126
+ fsrc_bytes += data
127
+
128
+ if expected_md5 and file_md5 .hexdigest () != expected_md5 :
107
129
raise ValueError ("md5checksum: {} does not match expected: "
108
- "{}" .format (actual_md5_checksum ,
109
- md5_checksum ))
110
- return bytes_content
130
+ "{}" .format (file_md5 . hexdigest () ,
131
+ expected_md5 ))
132
+ return fsrc_bytes
111
133
112
134
if data_home is None :
113
135
fsrc = urlopen (req )
114
136
if is_gzip (fsrc ):
115
137
fsrc = gzip .GzipFile (fileobj = fsrc , mode = 'rb' )
116
- bytes_content = fsrc .read ()
117
- if expected_md5_checksum :
118
- # validating checksum reads and consumes the stream
119
- return _md5_validated_bytes (bytes_content , expected_md5_checksum )
120
- return bytes_content
138
+ return _md5_validated_bytestream (
139
+ fsrc ,
140
+ expected_md5 = expected_md5_checksum
141
+ )
121
142
122
143
local_path = _get_local_path (openml_path , data_home )
123
144
if not os .path .exists (local_path ):
@@ -131,10 +152,10 @@ def _md5_validated_bytes(bytes_content, md5_checksum):
131
152
with closing (urlopen (req )) as fsrc :
132
153
if is_gzip (fsrc ): # unzip it for checksum validation
133
154
fsrc = gzip .GzipFile (fileobj = fsrc , mode = 'rb' )
134
- bytes_content = fsrc . read ()
135
- if expected_md5_checksum :
136
- bytes_content = _md5_validated_bytes ( bytes_content ,
137
- expected_md5_checksum )
155
+ bytes_content = _md5_validated_bytestream (
156
+ fsrc ,
157
+ expected_md5 = expected_md5_checksum
158
+ )
138
159
with gzip .GzipFile (local_path , 'wb' ) as fdst :
139
160
fdst .write (bytes_content )
140
161
except Exception :
0 commit comments