ENH Verify md5-checksums received from openml arff file metadata by shashanksingh28 · Pull Request #14800 · scikit-learn/scikit-learn · GitHub

Merged

31 commits
5e9f6c9
add verify_checksum functionality with tests
shashanksingh28 Aug 24, 2019
5df30c0
python3.5 compatible multi-line string
shashanksingh28 Aug 24, 2019
df4c049
use titanic local file and format string
shashanksingh28 Aug 25, 2019
f4ca32b
update locally truncated arff md5sums
shashanksingh28 Aug 25, 2019
831d78b
return bytes instead of stream, read once
shashanksingh28 Sep 10, 2019
10ecf9a
read and update md5 in chunks
shashanksingh28 Sep 30, 2019
f8c8fe4
bytearray extend while chunked construction
shashanksingh28 Oct 11, 2019
3c5ab3e
Update sklearn/datasets/openml.py
shashanksingh28 Oct 11, 2019
f2624e0
Merge branch 'upstream_master' into md5checksum_validate_openml
shashanksingh28 Nov 17, 2019
db159a5
add early exiting
shashanksingh28 Dec 3, 2019
f745e21
revert back to simple case
shashanksingh28 Jan 4, 2020
3433031
Merge upstream, verify checksum while yielding stream
shashanksingh28 May 3, 2020
c7774ef
Update checksum for truncated local test files
shashanksingh28 May 3, 2020
1a6b873
Merge branch 'upstream_master' into md5checksum_validate_openml
shashanksingh28 May 3, 2020
d00a413
Linting updates
shashanksingh28 May 3, 2020
2e744aa
Fully consume generator, test for non frame case
shashanksingh28 May 4, 2020
1e6efdd
Cross platform assert in test
shashanksingh28 May 4, 2020
bf937a3
Intentionally reach end-of-stream checksum validation
shashanksingh28 May 25, 2020
eac5a1e
Test should not modify local test-suite shared file
shashanksingh28 Jun 6, 2020
d075a83
Update sklearn/datasets/tests/test_openml.py
shashanksingh28 Jun 7, 2020
23ba190
Use tmpdir for creating corrupt file, add comments, update changelog
shashanksingh28 Jun 7, 2020
9bda995
Merge branch 'md5checksum_validate_openml' of github.com:shashanksing…
shashanksingh28 Jun 7, 2020
4ab9036
Merge upstream master
shashanksingh28 Jun 7, 2020
c216171
Make test-path platform independent
shashanksingh28 Jun 7, 2020
536bc4f
Do not remove file explicitly from tmpdir
shashanksingh28 Jun 7, 2020
c55f64a
Make test mock class private to ignore coverage
shashanksingh28 Jun 7, 2020
c12fd02
Merge branch 'upstream_master' into md5checksum_validate_openml
shashanksingh28 Jun 7, 2020
02729a4
Merge branch 'master' into md5checksum_validate_openml
rth Jun 25, 2020
fc9181d
Fix merge conflict issues
rth Jun 25, 2020
07a53b6
fmt
rth Jun 25, 2020
d5bebcf
CLN Early skip if pandas is not available
thomasjpfan Jun 26, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions doc/whats_new/v0.24.rst
@@ -224,6 +224,11 @@ Changelog
   :meth:`tree.DecisionTreeRegressor.fit`, and has no effect.
   :pr:`17614` by :user:`Juan Carlos Alfaro Jiménez <alfaro96>`.
 
+:mod:`sklearn.datasets`
+.......................
+- |Feature| :func:`datasets.fetch_openml` now validates md5 checksum of arff
+  files downloaded or cached to ensure data integrity.
+  :pr:`14800` by :user:`Shashank Singh <shashanksingh28>` and `Joel Nothman`_.
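In user terms, the new validation turns a corrupted download or cache entry into an explicit error instead of silently parsed bad data. A minimal usage sketch (the dataset id and the cache-clearing advice are illustrative, not part of this diff):

```python
from sklearn.datasets import fetch_openml

try:
    bunch = fetch_openml(data_id=40945, as_frame=False)  # e.g. titanic
except ValueError as err:
    # Raised when the md5 of the fetched arff bytes does not match the
    # checksum published in the OpenML dataset description; clearing the
    # scikit-learn data_home cache and retrying is the usual remedy.
    print("corrupted openml download:", err)
```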

 Code and Documentation Contributors
 -----------------------------------
40 changes: 34 additions & 6 deletions sklearn/datasets/_openml.py
@@ -2,6 +2,7 @@
 import json
 import os
 import shutil
+import hashlib
 from os.path import join
 from warnings import warn
 from contextlib import closing
@@ -492,18 +493,41 @@ def _load_arff_response(
     url: str,
     data_home: Optional[str],
     return_type, encode_nominal: bool,
-    parse_arff: Callable[[ArffContainerType], Tuple]
+    parse_arff: Callable[[ArffContainerType], Tuple],
+    md5_checksum: str
 ) -> Tuple:
     """Load arff data with url and parses arff response with parse_arff"""
     response = _open_openml_url(url, data_home)
 
     with closing(response):
         # Note that if the data is dense, no reading is done until the data
         # generator is iterated.
-        arff = _arff.load((line.decode('utf-8') for line in response),
+        actual_md5_checksum = hashlib.md5()
+
+        def _stream_checksum_generator(response):
+            for line in response:
+                actual_md5_checksum.update(line)
+                yield line.decode('utf-8')
+
+        stream = _stream_checksum_generator(response)
+
+        arff = _arff.load(stream,
                           return_type=return_type,
                           encode_nominal=encode_nominal)
-        return parse_arff(arff)
+
+        parsed_arff = parse_arff(arff)
+
+        # consume remaining stream, if early exited
+        for _ in stream:
+            pass
+
+        if actual_md5_checksum.hexdigest() != md5_checksum:
+            raise ValueError("md5 checksum of local file for " + url +
+                             " does not match description. "
+                             "Downloaded file could have been modified / "
+                             "corrupted, clean cache and retry...")
+
+        return parsed_arff
 
 
 def _download_data_to_bunch(
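The heart of the change above is a generator that hashes the raw bytes as they stream into the arff parser, then drains whatever the parser did not consume (it may stop reading early) so the digest covers the whole file before it is compared. A standalone sketch of that pattern (a simplified model, not the scikit-learn implementation):

```python
import hashlib
import io


def _checked_lines(raw_lines, expected_md5):
    """Yield decoded lines while hashing raw bytes; verify on exhaustion."""
    digest = hashlib.md5()
    for line in raw_lines:
        digest.update(line)          # hash exactly the bytes received
        yield line.decode('utf-8')
    if digest.hexdigest() != expected_md5:
        raise ValueError("md5 checksum does not match")


payload = b"@RELATION demo\n@DATA\n1,2\n"
expected = hashlib.md5(payload).hexdigest()
lines = list(_checked_lines(io.BytesIO(payload), expected))
assert lines == ["@RELATION demo\n", "@DATA\n", "1,2\n"]
```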
@@ -515,7 +539,8 @@ def _download_data_to_bunch(
     features_list: List,
     data_columns: List[int],
     target_columns: List,
-    shape: Optional[Tuple[int, int]]
+    shape: Optional[Tuple[int, int]],
+    md5_checksum: str
 ):
     """Download OpenML ARFF and convert to Bunch of data
     """
@@ -609,7 +634,8 @@ def postprocess(X, y, nominal_attributes):
         _load_arff_response)(url, data_home,
                              return_type=return_type,
                              encode_nominal=not as_frame,
-                             parse_arff=parse_arff)
+                             parse_arff=parse_arff,
+                             md5_checksum=md5_checksum)
     X, y, frame, nominal_attributes = postprocess(*out)
 
     return Bunch(data=X, target=y, frame=frame,
@@ -883,7 +909,9 @@ def fetch_openml(
                                     as_frame=as_frame,
                                     features_list=features_list, shape=shape,
                                     target_columns=target_columns,
-                                    data_columns=data_columns)
+                                    data_columns=data_columns,
+                                    md5_checksum=data_description[
+                                        "md5_checksum"])
 
     if return_X_y:
         return bunch.data, bunch.target
Binary file modified sklearn/datasets/tests/data/openml/1/api-v1-json-data-1.json.gz
Binary file modified sklearn/datasets/tests/data/openml/2/api-v1-json-data-2.json.gz
114 changes: 75 additions & 39 deletions sklearn/datasets/tests/test_openml.py
@@ -149,6 +149,32 @@ def _fetch_dataset_from_openml(data_id, data_name, data_version,
     return data_by_id
 
 
+class _MockHTTPResponse:
+    def __init__(self, data, is_gzip):
+        self.data = data
+        self.is_gzip = is_gzip
+
+    def read(self, amt=-1):
+        return self.data.read(amt)
+
+    def close(self):
+        self.data.close()
+
+    def info(self):
+        if self.is_gzip:
+            return {'Content-Encoding': 'gzip'}
+        return {}
+
+    def __iter__(self):
+        return iter(self.data)
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        return False
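This mock only has to honor the small slice of the urlopen response interface that the openml loader touches: iteration over raw byte lines, `read()`, `info()` for the gzip header, and context management. A quick illustrative check of that contract (not part of the test suite):

```python
import io

raw = io.BytesIO(b"@RELATION demo\n@DATA\n1,2\n")
with _MockHTTPResponse(raw, is_gzip=False) as response:
    assert response.info() == {}  # no 'Content-Encoding: gzip' advertised
    assert next(iter(response)) == b"@RELATION demo\n"
```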


 def _monkey_patch_webbased_functions(context,
                                      data_id,
                                      gzip_response):
@@ -163,37 +189,6 @@ def _monkey_patch_webbased_functions(context,
         path_suffix = '.gz'
         read_fn = gzip.open
 
-    class MockHTTPResponse:
-        def __init__(self, data, is_gzip):
-            self.data = data
-            self.is_gzip = is_gzip
-
-        def read(self, amt=-1):
-            return self.data.read(amt)
-
-        def tell(self):
-            return self.data.tell()
-
-        def seek(self, pos, whence=0):
-            return self.data.seek(pos, whence)
-
-        def close(self):
-            self.data.close()
-
-        def info(self):
-            if self.is_gzip:
-                return {'Content-Encoding': 'gzip'}
-            return {}
-
-        def __iter__(self):
-            return iter(self.data)
-
-        def __enter__(self):
-            return self
-
-        def __exit__(self, exc_type, exc_val, exc_tb):
-            return False
-
     def _file_name(url, suffix):
         return (re.sub(r'\W', '-', url[len("https://openml.org/"):])
                 + suffix + path_suffix)
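`_file_name` above is what ties a mocked URL to a fixture on disk: it strips the OpenML prefix and replaces non-word characters with dashes. Restated standalone (with `path_suffix` made an explicit parameter for illustration), it reproduces the fixture names appearing in this PR:

```python
import re


def _file_name(url, suffix, path_suffix='.gz'):
    return (re.sub(r'\W', '-', url[len("https://openml.org/"):])
            + suffix + path_suffix)


assert _file_name("https://openml.org/api/v1/json/data/2",
                  ".json") == "api-v1-json-data-2.json.gz"
```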
@@ -206,21 +201,21 @@ def _mock_urlopen_data_description(url, has_gzip_header):
 
         if has_gzip_header and gzip_response:
             fp = open(path, 'rb')
-            return MockHTTPResponse(fp, True)
+            return _MockHTTPResponse(fp, True)
         else:
             fp = read_fn(path, 'rb')
-            return MockHTTPResponse(fp, False)
+            return _MockHTTPResponse(fp, False)
 
     def _mock_urlopen_data_features(url, has_gzip_header):
         assert url.startswith(url_prefix_data_features)
         path = os.path.join(currdir, 'data', 'openml', str(data_id),
                             _file_name(url, '.json'))
         if has_gzip_header and gzip_response:
             fp = open(path, 'rb')
-            return MockHTTPResponse(fp, True)
+            return _MockHTTPResponse(fp, True)
         else:
             fp = read_fn(path, 'rb')
-            return MockHTTPResponse(fp, False)
+            return _MockHTTPResponse(fp, False)
 
     def _mock_urlopen_download_data(url, has_gzip_header):
         assert (url.startswith(url_prefix_download_data))
@@ -230,10 +225,10 @@ def _mock_urlopen_download_data(url, has_gzip_header):
 
         if has_gzip_header and gzip_response:
             fp = open(path, 'rb')
-            return MockHTTPResponse(fp, True)
+            return _MockHTTPResponse(fp, True)
         else:
             fp = read_fn(path, 'rb')
-            return MockHTTPResponse(fp, False)
+            return _MockHTTPResponse(fp, False)
 
     def _mock_urlopen_data_list(url, has_gzip_header):
         assert url.startswith(url_prefix_data_list)
@@ -250,10 +245,10 @@ def _mock_urlopen_data_list(url, has_gzip_header):
 
         if has_gzip_header:
             fp = open(json_file_path, 'rb')
-            return MockHTTPResponse(fp, True)
+            return _MockHTTPResponse(fp, True)
         else:
             fp = read_fn(json_file_path, 'rb')
-            return MockHTTPResponse(fp, False)
+            return _MockHTTPResponse(fp, False)
 
     def _mock_urlopen(request):
         url = request.get_full_url()
@@ -1209,6 +1204,47 @@ def test_fetch_openml_with_ignored_feature(monkeypatch, gzip_response):
     assert 'animal' not in dataset['feature_names']
 
 
+@pytest.mark.parametrize('as_frame', [True, False])
+def test_fetch_openml_verify_checksum(monkeypatch, as_frame, cache, tmpdir):
+    if as_frame:
+        pytest.importorskip('pandas')
+
+    data_id = 2
+    _monkey_patch_webbased_functions(monkeypatch, data_id, True)
+
+    # create a temporary modified arff file
+    dataset_dir = os.path.join(currdir, 'data', 'openml', str(data_id))
+    original_data_path = os.path.join(dataset_dir,
+                                      'data-v1-download-1666876.arff.gz')
+    corrupt_copy = os.path.join(tmpdir, "test_invalid_checksum.arff")
+    with gzip.GzipFile(original_data_path, "rb") as orig_gzip, \
+            gzip.GzipFile(corrupt_copy, "wb") as modified_gzip:
+        data = bytearray(orig_gzip.read())
+        data[len(data)-1] = 37
+        modified_gzip.write(data)
+
+    # Requests are already mocked by monkey_patch_webbased_functions.
+    # We want to re-use that mock for all requests except file download,
+    # hence creating a thin mock over the original mock
+    mocked_openml_url = sklearn.datasets._openml.urlopen

[Review comment from a project member on the line above: "Leave a comment that sklearn.datasets._openml.urlopen is already mocked by _monkey_patch_webbased_functions."]

+    def swap_file_mock(request):
+        url = request.get_full_url()
+        if url.endswith('data/v1/download/1666876'):
+            return _MockHTTPResponse(open(corrupt_copy, "rb"), is_gzip=True)
+        else:
+            return mocked_openml_url(request)
+
+    monkeypatch.setattr(sklearn.datasets._openml, 'urlopen', swap_file_mock)
+
+    # validate failed checksum
+    with pytest.raises(ValueError) as exc:
+        sklearn.datasets.fetch_openml(data_id=data_id, cache=False,
+                                      as_frame=as_frame)
+    # exception message should have file-path
+    assert exc.match("1666876")


 def test_convert_arff_data_type():
     pytest.importorskip('pandas')