ENH Verify md5-checksums received from openml arff file metadata(#14800) · scikit-learn/scikit-learn@1e08459 · GitHub
[go: up one dir, main page]

Skip to content

Commit 1e08459

Browse files
shashanksingh28thomasjpfanrth
authored
ENH Verify md5-checksums received from openml arff file metadata(#14800)
Co-authored-by: Thomas J Fan <thomasjpfan@gmail.com> Co-authored-by: Thomas J. Fan <thomasjpfan@gmail.com> Co-authored-by: Roman Yurchak <rth.yurchak@gmail.com>
1 parent b0c03d1 commit 1e08459

11 files changed

+114
-45
lines changed

doc/whats_new/v0.24.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -224,6 +224,11 @@ Changelog
224224
:meth:`tree.DecisionTreeRegressor.fit`, and has no effect.
225225
:pr:`17614` by :user:`Juan Carlos Alfaro Jiménez <alfaro96>`.
226226

227+
:mod:`sklearn.datasets`
228+
.......................
229+
- |Feature| :func:`datasets.fetch_openml` now validates the md5 checksum of arff
230+
files downloaded or cached to ensure data integrity.
231+
:pr:`14800` by :user:`Shashank Singh <shashanksingh28>` and `Joel Nothman`_.
227232

228233
Code and Documentation Contributors
229234
-----------------------------------

sklearn/datasets/_openml.py

Lines changed: 34 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import json
33
import os
44
import shutil
5+
import hashlib
56
from os.path import join
67
from warnings import warn
78
from contextlib import closing
@@ -492,18 +493,41 @@ def _load_arff_response(
492493
url: str,
493494
data_home: Optional[str],
494495
return_type, encode_nominal: bool,
495-
parse_arff: Callable[[ArffContainerType], Tuple]
496+
parse_arff: Callable[[ArffContainerType], Tuple],
497+
md5_checksum: str
496498
) -> Tuple:
497499
"""Load arff data with url and parses arff response with parse_arff"""
498500
response = _open_openml_url(url, data_home)
499501

500502
with closing(response):
501503
# Note that if the data is dense, no reading is done until the data
502504
# generator is iterated.
503-
arff = _arff.load((line.decode('utf-8') for line in response),
505+
actual_md5_checksum = hashlib.md5()
506+
507+
def _stream_checksum_generator(response):
508+
for line in response:
509+
actual_md5_checksum.update(line)
510+
yield line.decode('utf-8')
511+
512+
stream = _stream_checksum_generator(response)
513+
514+
arff = _arff.load(stream,
504515
return_type=return_type,
505516
encode_nominal=encode_nominal)
506-
return parse_arff(arff)
517+
518+
parsed_arff = parse_arff(arff)
519+
520+
# consume remaining stream, if early exited
521+
for _ in stream:
522+
pass
523+
524+
if actual_md5_checksum.hexdigest() != md5_checksum:
525+
raise ValueError("md5 checksum of local file for " + url +
526+
" does not match description. "
527+
"Downloaded file could have been modified / "
528+
"corrupted, clean cache and retry...")
529+
530+
return parsed_arff
507531

508532

509533
def _download_data_to_bunch(
@@ -515,7 +539,8 @@ def _download_data_to_bunch(
515539
features_list: List,
516540
data_columns: List[int],
517541
target_columns: List,
518-
shape: Optional[Tuple[int, int]]
542+
shape: Optional[Tuple[int, int]],
543+
md5_checksum: str
519544
):
520545
"""Download OpenML ARFF and convert to Bunch of data
521546
"""
@@ -609,7 +634,8 @@ def postprocess(X, y, nominal_attributes):
609634
_load_arff_response)(url, data_home,
610635
return_type=return_type,
611636
encode_nominal=not as_frame,
612-
parse_arff=parse_arff)
637+
parse_arff=parse_arff,
638+
md5_checksum=md5_checksum)
613639
X, y, frame, nominal_attributes = postprocess(*out)
614640

615641
return Bunch(data=X, target=y, frame=frame,
@@ -883,7 +909,9 @@ def fetch_openml(
883909
as_frame=as_frame,
884910
features_list=features_list, shape=shape,
885911
target_columns=target_columns,
886-
data_columns=data_columns)
912+
data_columns=data_columns,
913+
md5_checksum=data_description[
914+
"md5_checksum"])
887915

888916
if return_X_y:
889917
return bunch.data, bunch.target
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.

0 commit comments

Comments
 (0)
0