ENH Verify md5-checksums received from openml arff file metadata by shashanksingh28 · Pull Request #14800 · scikit-learn/scikit-learn · GitHub

Merged

31 commits
5e9f6c9
add verify_checksum functionality with tests
shashanksingh28 Aug 24, 2019
5df30c0
python3.5 compatible multi-line string
shashanksingh28 Aug 24, 2019
df4c049
use titanic local file and format string
shashanksingh28 Aug 25, 2019
f4ca32b
update locally truncated arff md5sums
shashanksingh28 Aug 25, 2019
831d78b
return bytes instead of stream, read once
shashanksingh28 Sep 10, 2019
10ecf9a
read and update md5 in chunks
shashanksingh28 Sep 30, 2019
f8c8fe4
bytearray extend while chunked construction
shashanksingh28 Oct 11, 2019
3c5ab3e
Update sklearn/datasets/openml.py
shashanksingh28 Oct 11, 2019
f2624e0
Merge branch 'upstream_master' into md5checksum_validate_openml
shashanksingh28 Nov 17, 2019
db159a5
add early exiting
shashanksingh28 Dec 3, 2019
f745e21
revert back to simple case
shashanksingh28 Jan 4, 2020
3433031
Merge upstream, verify checksum while yielding stream
shashanksingh28 May 3, 2020
c7774ef
Update checksum for truncated local test files
shashanksingh28 May 3, 2020
1a6b873
Merge branch 'upstream_master' into md5checksum_validate_openml
shashanksingh28 May 3, 2020
d00a413
Linting updates
shashanksingh28 May 3, 2020
2e744aa
Fully consume generator, test for non frame case
shashanksingh28 May 4, 2020
1e6efdd
Cross platform assert in test
shashanksingh28 May 4, 2020
bf937a3
Intentionally reach end-of-stream checksum validation
shashanksingh28 May 25, 2020
eac5a1e
Test should not modify local test-suite shared file
shashanksingh28 Jun 6, 2020
d075a83
Update sklearn/datasets/tests/test_openml.py
shashanksingh28 Jun 7, 2020
23ba190
Use tmpdir for creating corrupt file, add comments, update changelog
shashanksingh28 Jun 7, 2020
9bda995
Merge branch 'md5checksum_validate_openml' of github.com:shashanksing…
shashanksingh28 Jun 7, 2020
4ab9036
Merge upstream master
shashanksingh28 Jun 7, 2020
c216171
Make test-path platform independent
shashanksingh28 Jun 7, 2020
536bc4f
Do not remove file explicitly from tmpdir
shashanksingh28 Jun 7, 2020
c55f64a
Make test mock class private to ignore coverage
shashanksingh28 Jun 7, 2020
c12fd02
Merge branch 'upstream_master' into md5checksum_validate_openml
shashanksingh28 Jun 7, 2020
02729a4
Merge branch 'master' into md5checksum_validate_openml
rth Jun 25, 2020
fc9181d
Fix merge conflict issues
rth Jun 25, 2020
07a53b6
fmt
rth Jun 25, 2020
d5bebcf
CLN Early skip if pandas is not available
thomasjpfan Jun 26, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions doc/whats_new/v0.24.rst
@@ -224,6 +224,11 @@ Changelog
   :meth:`tree.DecisionTreeRegressor.fit`, and has no effect.
   :pr:`17614` by :user:`Juan Carlos Alfaro Jiménez <alfaro96>`.
 
+:mod:`sklearn.datasets`
+.......................
+- |Feature| :func:`datasets.fetch_openml` now validates md5 checksum of arff
+  files downloaded or cached to ensure data integrity.
+  :pr:`14800` by :user:`Shashank Singh <shashanksingh28>` and `Joel Nothman`_.
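In user terms, the new validation turns a corrupted download or cache entry into an explicit error instead of silently parsed bad data. A minimal usage sketch (the dataset id and the cache-clearing advice are illustrative, not part of this diff):

```python
from sklearn.datasets import fetch_openml

try:
    bunch = fetch_openml(data_id=40945, as_frame=False)  # e.g. titanic
except ValueError as err:
    # Raised when the md5 of the fetched arff bytes does not match the
    # checksum published in the OpenML dataset description; clearing the
    # scikit-learn data_home cache and retrying is the usual remedy.
    print("corrupted openml download:", err)
```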

 Code and Documentation Contributors
 -----------------------------------
40 changes: 34 additions & 6 deletions sklearn/datasets/_openml.py
@@ -2,6 +2,7 @@
 import json
 import os
 import shutil
+import hashlib
 from os.path import join
 from warnings import warn
 from contextlib import closing
@@ -492,18 +493,41 @@ def _load_arff_response(
     url: str,
     data_home: Optional[str],
     return_type, encode_nominal: bool,
-    parse_arff: Callable[[ArffContainerType], Tuple]
+    parse_arff: Callable[[ArffContainerType], Tuple],
+    md5_checksum: str
 ) -> Tuple:
     """Load arff data with url and parses arff response with parse_arff"""
     response = _open_openml_url(url, data_home)
 
     with closing(response):
         # Note that if the data is dense, no reading is done until the data
         # generator is iterated.
-        arff = _arff.load((line.decode('utf-8') for line in response),
+        actual_md5_checksum = hashlib.md5()
+
+        def _stream_checksum_generator(response):
+            for line in response:
+                actual_md5_checksum.update(line)
+                yield line.decode('utf-8')
+
+        stream = _stream_checksum_generator(response)
+
+        arff = _arff.load(stream,
                           return_type=return_type,
                           encode_nominal=encode_nominal)
-        return parse_arff(arff)
+
+        parsed_arff = parse_arff(arff)
+
+        # consume remaining stream, if early exited
+        for _ in stream:
+            pass
+
+        if actual_md5_checksum.hexdigest() != md5_checksum:
+            raise ValueError("md5 checksum of local file for " + url +
+                             " does not match description. "
+                             "Downloaded file could have been modified / "
+                             "corrupted, clean cache and retry...")
+
+        return parsed_arff
 
 
 def _download_data_to_bunch(
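The heart of the change above is a generator that hashes the raw bytes as they stream into the arff parser, then drains whatever the parser did not consume (it may stop reading early) so the digest covers the whole file before it is compared. A standalone sketch of that pattern (a simplified model, not the scikit-learn implementation):

```python
import hashlib
import io


def _checked_lines(raw_lines, expected_md5):
    """Yield decoded lines while hashing raw bytes; verify on exhaustion."""
    digest = hashlib.md5()
    for line in raw_lines:
        digest.update(line)          # hash exactly the bytes received
        yield line.decode('utf-8')
    if digest.hexdigest() != expected_md5:
        raise ValueError("md5 checksum does not match")


payload = b"@RELATION demo\n@DATA\n1,2\n"
expected = hashlib.md5(payload).hexdigest()
lines = list(_checked_lines(io.BytesIO(payload), expected))
assert lines == ["@RELATION demo\n", "@DATA\n", "1,2\n"]
```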
@@ -515,7 +539,8 @@ def _download_data_to_bunch(
     features_list: List,
     data_columns: List[int],
     target_columns: List,
-    shape: Optional[Tuple[int, int]]
+    shape: Optional[Tuple[int, int]],
+    md5_checksum: str
 ):
     """Download OpenML ARFF and convert to Bunch of data
     """
@@ -609,7 +634,8 @@ def postprocess(X, y, nominal_attributes):
         _load_arff_response)(url, data_home,
                              return_type=return_type,
                              encode_nominal=not as_frame,
-                             parse_arff=parse_arff)
+                             parse_arff=parse_arff,
+                             md5_checksum=md5_checksum)
     X, y, frame, nominal_attributes = postprocess(*out)
 
     return Bunch(data=X, target=y, frame=frame,
@@ -883,7 +909,9 @@ def fetch_openml(
                                     as_frame=as_frame,
                                     features_list=features_list, shape=shape,
                                     target_columns=target_columns,
-                                    data_columns=data_columns)
+                                    data_columns=data_columns,
+                                    md5_checksum=data_description[
+                                        "md5_checksum"])
 
     if return_X_y:
         return bunch.data, bunch.target
Binary file modified sklearn/datasets/tests/data/openml/1/api-v1-json-data-1.json.gz
Binary file modified sklearn/datasets/tests/data/openml/2/api-v1-json-data-2.json.gz
114 changes: 75 additions & 39 deletions sklearn/datasets/tests/test_openml.py
@@ -149,6 +149,32 @@ def _fetch_dataset_from_openml(data_id, data_name, data_version,
     return data_by_id
 
 
+class _MockHTTPResponse:
+    def __init__(self, data, is_gzip):
+        self.data = data
+        self.is_gzip = is_gzip
+
+    def read(self, amt=-1):
+        return self.data.read(amt)
+
+    def close(self):
+        self.data.close()
+
+    def info(self):
+        if self.is_gzip:
+            return {'Content-Encoding': 'gzip'}
+        return {}
+
+    def __iter__(self):
+        return iter(self.data)
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        return False
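This mock only has to honor the small slice of the urlopen response interface that the openml loader touches: iteration over raw byte lines, `read()`, `info()` for the gzip header, and context management. A quick illustrative check of that contract (not part of the test suite):

```python
import io

raw = io.BytesIO(b"@RELATION demo\n@DATA\n1,2\n")
with _MockHTTPResponse(raw, is_gzip=False) as response:
    assert response.info() == {}  # no 'Content-Encoding: gzip' advertised
    assert next(iter(response)) == b"@RELATION demo\n"
```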


 def _monkey_patch_webbased_functions(context,
                                      data_id,
                                      gzip_response):
@@ -163,37 +189,6 @@ def _monkey_patch_webbased_functions(context,
         path_suffix = '.gz'
         read_fn = gzip.open
 
-    class MockHTTPResponse:
-        def __init__(self, data, is_gzip):
-            self.data = data
-            self.is_gzip = is_gzip
-
-        def read(self, amt=-1):
-            return self.data.read(amt)
-
-        def tell(self):
-            return self.data.tell()
-
-        def seek(self, pos, whence=0):
-            return self.data.seek(pos, whence)
-
-        def close(self):
-            self.data.close()
-
-        def info(self):
-            if self.is_gzip:
-                return {'Content-Encoding': 'gzip'}
-            return {}
-
-        def __iter__(self):
-            return iter(self.data)
-
-        def __enter__(self):
-            return self
-
-        def __exit__(self, exc_type, exc_val, exc_tb):
-            return False
-
     def _file_name(url, suffix):
         return (re.sub(r'\W', '-', url[len("https://openml.org/"):])
                 + suffix + path_suffix)
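`_file_name` above is what ties a mocked URL to a fixture on disk: it strips the OpenML prefix and replaces non-word characters with dashes. Restated standalone (with `path_suffix` made an explicit parameter for illustration), it reproduces the fixture names appearing in this PR:

```python
import re


def _file_name(url, suffix, path_suffix='.gz'):
    return (re.sub(r'\W', '-', url[len("https://openml.org/"):])
            + suffix + path_suffix)


assert _file_name("https://openml.org/api/v1/json/data/2",
                  ".json") == "api-v1-json-data-2.json.gz"
```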
@@ -206,21 +201,21 @@ def _mock_urlopen_data_description(url, has_gzip_header):
 
         if has_gzip_header and gzip_response:
             fp = open(path, 'rb')
-            return MockHTTPResponse(fp, True)
+            return _MockHTTPResponse(fp, True)
         else:
             fp = read_fn(path, 'rb')
-            return MockHTTPResponse(fp, False)
+            return _MockHTTPResponse(fp, False)
 
     def _mock_urlopen_data_features(url, has_gzip_header):
         assert url.startswith(url_prefix_data_features)
         path = os.path.join(currdir, 'data', 'openml', str(data_id),
                             _file_name(url, '.json'))
         if has_gzip_header and gzip_response:
             fp = open(path, 'rb')
-            return MockHTTPResponse(fp, True)
+            return _MockHTTPResponse(fp, True)
         else:
             fp = read_fn(path, 'rb')
-            return MockHTTPResponse(fp, False)
+            return _MockHTTPResponse(fp, False)
 
     def _mock_urlopen_download_data(url, has_gzip_header):
         assert (url.startswith(url_prefix_download_data))
@@ -230,10 +225,10 @@ def _mock_urlopen_download_data(url, has_gzip_header):
 
         if has_gzip_header and gzip_response:
             fp = open(path, 'rb')
-            return MockHTTPResponse(fp, True)
+            return _MockHTTPResponse(fp, True)
         else:
             fp = read_fn(path, 'rb')
-            return MockHTTPResponse(fp, False)
+            return _MockHTTPResponse(fp, False)
 
     def _mock_urlopen_data_list(url, has_gzip_header):
         assert url.startswith(url_prefix_data_list)
@@ -250,10 +245,10 @@ def _mock_urlopen_data_list(url, has_gzip_header):
 
         if has_gzip_header:
             fp = open(json_file_path, 'rb')
-            return MockHTTPResponse(fp, True)
+            return _MockHTTPResponse(fp, True)
         else:
             fp = read_fn(json_file_path, 'rb')
-            return MockHTTPResponse(fp, False)
+            return _MockHTTPResponse(fp, False)
 
     def _mock_urlopen(request):
         url = request.get_full_url()
@@ -1209,6 +1204,47 @@ def test_fetch_openml_with_ignored_feature(monkeypatch, gzip_response):
     assert 'animal' not in dataset['feature_names']
 
 
+@pytest.mark.parametrize('as_frame', [True, False])
+def test_fetch_openml_verify_checksum(monkeypatch, as_frame, cache, tmpdir):
+    if as_frame:
+        pytest.importorskip('pandas')
+
+    data_id = 2
+    _monkey_patch_webbased_functions(monkeypatch, data_id, True)
+
+    # create a temporary modified arff file
+    dataset_dir = os.path.join(currdir, 'data', 'openml', str(data_id))
+    original_data_path = os.path.join(dataset_dir,
+                                      'data-v1-download-1666876.arff.gz')
+    corrupt_copy = os.path.join(tmpdir, "test_invalid_checksum.arff")
+    with gzip.GzipFile(original_data_path, "rb") as orig_gzip, \
+            gzip.GzipFile(corrupt_copy, "wb") as modified_gzip:
+        data = bytearray(orig_gzip.read())
+        data[len(data)-1] = 37
+        modified_gzip.write(data)
+
+    # Requests are already mocked by monkey_patch_webbased_functions.
+    # We want to re-use that mock for all requests except file download,
+    # hence creating a thin mock over the original mock
+    mocked_openml_url = sklearn.datasets._openml.urlopen

[Review comment from a project member on the line above: "Leave a comment that sklearn.datasets._openml.urlopen is already mocked by _monkey_patch_webbased_functions."]

+    def swap_file_mock(request):
+        url = request.get_full_url()
+        if url.endswith('data/v1/download/1666876'):
+            return _MockHTTPResponse(open(corrupt_copy, "rb"), is_gzip=True)
+        else:
+            return mocked_openml_url(request)
+
+    monkeypatch.setattr(sklearn.datasets._openml, 'urlopen', swap_file_mock)
+
+    # validate failed checksum
+    with pytest.raises(ValueError) as exc:
+        sklearn.datasets.fetch_openml(data_id=data_id, cache=False,
+                                      as_frame=as_frame)
+    # exception message should have file-path
+    assert exc.match("1666876")


 def test_convert_arff_data_type():
     pytest.importorskip('pandas')