read and update md5 in chunks · scikit-learn/scikit-learn@2b18909 · GitHub
[go: up one dir, main page]

Skip to content

Commit 2b18909

Browse files
read and update md5 in chunks
1 parent 831d78b commit 2b18909

File tree

1 file changed

+42
-21
lines changed

1 file changed

+42
-21
lines changed

sklearn/datasets/openml.py

Lines changed: 42 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -86,38 +86,59 @@ def is_gzip(_fsrc):
8686
req = Request(_OPENML_PREFIX + openml_path)
8787
req.add_header('Accept-encoding', 'gzip')
8888

89-
def _md5_validated_bytes(bytes_content, md5_checksum):
89+
def _md5_validated_bytestream(fsrc, expected_md5=None, chunk_size=512):
9090
"""
91-
Consume binary stream to validate checksum,
92-
return a new stream with same content
91+
Takes in a byte-stream, reads in chunks and returns bytes.
92+
If expected_md5 is not None, keeps md5 checksum state while streaming
93+
and validates post stream consumption.
9394
9495
Parameters
9596
----------
96-
bytes_content : bytes
9797
98-
md5_checksum: str
99-
Expected md5 checksum of bytes
98+
fsrc : io.BufferedIOBase
99+
input stream to read bytes from
100+
101+
expected_md5 : str
102+
expected md5-checksum value
103+
104+
chunk_size : int
105+
size of chunks to read at a time from stream
100106
101107
Returns
102108
-------
103-
bytes
109+
fsrc_bytes : bytes
110+
equivalent to fsrc.read() but with md5 validation if
111+
expected_md5 is provided
112+
113+
Raises
114+
------
115+
116+
ValueError : if the computed md5 checksum does not match expected_md5
104117
"""
105-
actual_md5_checksum = hashlib.md5(bytes_content).hexdigest()
106-
if md5_checksum != actual_md5_checksum:
118+
fsrc_bytes = bytes()
119+
file_md5 = hashlib.md5() if expected_md5_checksum else None
120+
while True:
121+
data = fsrc.read(chunk_size)
122+
if not data:
123+
break
124+
if expected_md5:
125+
file_md5.update(data)
126+
fsrc_bytes += data
127+
128+
if expected_md5 and file_md5.hexdigest() != expected_md5:
107129
raise ValueError("md5checksum: {} does not match expected: "
108-
"{}".format(actual_md5_checksum,
109-
md5_checksum))
110-
return bytes_content
130+
"{}".format(file_md5.hexdigest(),
131+
expected_md5))
132+
return fsrc_bytes
111133

112134
if data_home is None:
113135
fsrc = urlopen(req)
114136
if is_gzip(fsrc):
115137
fsrc = gzip.GzipFile(fileobj=fsrc, mode='rb')
116-
bytes_content = fsrc.read()
117-
if expected_md5_checksum:
118-
# validating checksum reads and consumes the stream
119-
return _md5_validated_bytes(bytes_content, expected_md5_checksum)
120-
return bytes_content
138+
return _md5_validated_bytestream(
139+
fsrc,
140+
expected_md5=expected_md5_checksum
141+
)
121142

122143
local_path = _get_local_path(openml_path, data_home)
123144
if not os.path.exists(local_path):
@@ -131,10 +152,10 @@ def _md5_validated_bytes(bytes_content, md5_checksum):
131152
with closing(urlopen(req)) as fsrc:
132153
if is_gzip(fsrc): # unzip it for checksum validation
133154
fsrc = gzip.GzipFile(fileobj=fsrc, mode='rb')
134-
bytes_content = fsrc.read()
135-
if expected_md5_checksum:
136-
bytes_content = _md5_validated_bytes(bytes_content,
137-
expected_md5_checksum)
155+
bytes_content = _md5_validated_bytestream(
156+
fsrc,
157+
expected_md5=expected_md5_checksum
158+
)
138159
with gzip.GzipFile(local_path, 'wb') as fdst:
139160
fdst.write(bytes_content)
140161
except Exception:

0 commit comments

Comments
 (0)
0