2
2
import json
3
3
import os
4
4
import shutil
5
+ import hashlib
5
6
from os .path import join
6
7
from warnings import warn
7
8
from contextlib import closing
@@ -492,18 +493,41 @@ def _load_arff_response(
492
493
url : str ,
493
494
data_home : Optional [str ],
494
495
return_type , encode_nominal : bool ,
495
- parse_arff : Callable [[ArffContainerType ], Tuple ]
496
+ parse_arff : Callable [[ArffContainerType ], Tuple ],
497
+ md5_checksum : str
496
498
) -> Tuple :
497
499
"""Load arff data with url and parses arff response with parse_arff"""
498
500
response = _open_openml_url (url , data_home )
499
501
500
502
with closing (response ):
501
503
# Note that if the data is dense, no reading is done until the data
502
504
# generator is iterated.
503
- arff = _arff .load ((line .decode ('utf-8' ) for line in response ),
505
+ actual_md5_checksum = hashlib .md5 ()
506
+
507
+ def _stream_checksum_generator (response ):
508
+ for line in response :
509
+ actual_md5_checksum .update (line )
510
+ yield line .decode ('utf-8' )
511
+
512
+ stream = _stream_checksum_generator (response )
513
+
514
+ arff = _arff .load (stream ,
504
515
return_type = return_type ,
505
516
encode_nominal = encode_nominal )
506
- return parse_arff (arff )
517
+
518
+ parsed_arff = parse_arff (arff )
519
+
520
+ # consume remaining stream, if early exited
521
+ for _ in stream :
522
+ pass
523
+
524
+ if actual_md5_checksum .hexdigest () != md5_checksum :
525
+ raise ValueError ("md5 checksum of local file for " + url +
526
+ " does not match description. "
527
+ "Downloaded file could have been modified / "
528
+ "corrupted, clean cache and retry..." )
529
+
530
+ return parsed_arff
507
531
508
532
509
533
def _download_data_to_bunch (
@@ -515,7 +539,8 @@ def _download_data_to_bunch(
515
539
features_list : List ,
516
540
data_columns : List [int ],
517
541
target_columns : List ,
518
- shape : Optional [Tuple [int , int ]]
542
+ shape : Optional [Tuple [int , int ]],
543
+ md5_checksum : str
519
544
):
520
545
"""Download OpenML ARFF and convert to Bunch of data
521
546
"""
@@ -609,7 +634,8 @@ def postprocess(X, y, nominal_attributes):
609
634
_load_arff_response )(url , data_home ,
610
635
return_type = return_type ,
611
636
encode_nominal = not as_frame ,
612
- parse_arff = parse_arff )
637
+ parse_arff = parse_arff ,
638
+ md5_checksum = md5_checksum )
613
639
X , y , frame , nominal_attributes = postprocess (* out )
614
640
615
641
return Bunch (data = X , target = y , frame = frame ,
@@ -883,7 +909,9 @@ def fetch_openml(
883
909
as_frame = as_frame ,
884
910
features_list = features_list , shape = shape ,
885
911
target_columns = target_columns ,
886
- data_columns = data_columns )
912
+ data_columns = data_columns ,
913
+ md5_checksum = data_description [
914
+ "md5_checksum" ])
887
915
888
916
if return_X_y :
889
917
return bunch .data , bunch .target
0 commit comments