From 5e9f6c94995aa7fdc4eaf1efeb5b73bcb73fde77 Mon Sep 17 00:00:00 2001 From: Shashank Singh Date: Sat, 24 Aug 2019 16:56:45 -0400 Subject: [PATCH 01/25] add verify_checksum functionality with tests --- sklearn/datasets/openml.py | 49 ++++++++++++-- sklearn/datasets/tests/test_openml.py | 93 ++++++++++++++++++++------- 2 files changed, 112 insertions(+), 30 deletions(-) diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py index d0197537949d9..826e0b57977e4 100644 --- a/sklearn/datasets/openml.py +++ b/sklearn/datasets/openml.py @@ -1,6 +1,8 @@ import gzip import json import os +from io import BytesIO +import hashlib import shutil from os.path import join from warnings import warn @@ -61,7 +63,7 @@ def wrapper(): return decorator -def _open_openml_url(openml_path, data_home): +def _open_openml_url(openml_path, data_home, expected_md5_checksum=None): """ Returns a resource from OpenML.org. Caches it to data_home if required. @@ -86,10 +88,29 @@ def is_gzip(_fsrc): req = Request(_OPENML_PREFIX + openml_path) req.add_header('Accept-encoding', 'gzip') + def _md5_validated_stream(input_stream, md5_checksum): + """ + Consume binary stream to validate checksum, + return a new stream with same content + :param input_stream: Stream to read bytes from + :param md5_checksum: Expected md5 checksum + :return: Stream with the original content for consumption + """ + with closing(input_stream): + bytes_content = input_stream.read() + actual_md5_checksum = hashlib.md5(bytes_content).hexdigest() + if md5_checksum != actual_md5_checksum: + raise ValueError(f"md5checksum {actual_md5_checksum} \ + does not match {md5_checksum}") + return BytesIO(bytes_content) + if data_home is None: fsrc = urlopen(req) if is_gzip(fsrc): - return gzip.GzipFile(fileobj=fsrc, mode='rb') + fsrc = gzip.GzipFile(fileobj=fsrc, mode='rb') + if expected_md5_checksum: + # validating checksum reads and consumes the stream + return _md5_validated_stream(fsrc, expected_md5_checksum) return fsrc local_path = _get_local_path(openml_path, data_home) @@ -102,6 +123,10 @@ def is_gzip(_fsrc): try: with closing(urlopen(req)) as fsrc: + if expected_md5_checksum: + if is_gzip(fsrc): + fsrc = gzip.GzipFile(fileobj=fsrc, mode='rb') + fsrc = _md5_validated_stream(fsrc, expected_md5_checksum) if is_gzip(fsrc): with open(local_path, 'wb') as fdst: shutil.copyfileobj(fsrc, fdst) @@ -448,7 +473,8 @@ def _get_num_samples(data_qualities): return int(float(qualities.get('NumberOfInstances', default_n_samples))) -def _download_data_arff(file_id, sparse, data_home, encode_nominal=True): +def _download_data_arff(file_id, sparse, data_home, encode_nominal=True, + expected_md5_checksum=None): # Accesses an ARFF file on the OpenML server. 
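The _md5_validated_stream helper added above hashes the fully read payload and re-wraps it so downstream code can keep treating the result as a stream. A minimal self-contained sketch of that idea (function name and sample bytes are illustrative, not part of the diff):

    import hashlib
    from io import BytesIO

    def validated_stream(raw_bytes, expected_md5):
        # Hash the complete payload and compare against the digest OpenML reports.
        actual_md5 = hashlib.md5(raw_bytes).hexdigest()
        if actual_md5 != expected_md5:
            raise ValueError("md5checksum {} does not match {}".format(
                actual_md5, expected_md5))
        # Re-wrap the already-consumed bytes so callers can still read() them.
        return BytesIO(raw_bytes)

    payload = b"@RELATION demo\n@DATA\n1.0\n"
    stream = validated_stream(payload, hashlib.md5(payload).hexdigest())
    print(stream.read())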
Documentation: # https://www.openml.org/api_data_docs#!/data/get_download_id # encode_nominal argument is to ensure unit testing, do not alter in @@ -457,7 +483,8 @@ def _download_data_arff(file_id, sparse, data_home, encode_nominal=True): @_retry_with_clean_cache(url, data_home) def _arff_load(): - with closing(_open_openml_url(url, data_home)) as response: + with closing(_open_openml_url(url, data_home, expected_md5_checksum)) \ + as response: if sparse is True: return_type = _arff.COO else: @@ -515,7 +542,7 @@ def _valid_data_column_names(features_list, target_columns): def fetch_openml(name=None, version='active', data_id=None, data_home=None, target_column='default-target', cache=True, return_X_y=False, - as_frame=False): + as_frame=False, verify_checksum=True): """Fetch dataset from openml by name or dataset id. Datasets are uniquely identified by either an integer ID or by a @@ -576,6 +603,11 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None, data. If ``return_X_y`` is True, then ``(data, target)`` will be pandas DataFrames or Series as describe above. + verify_checksum : boolean, default=True + If True, verifies md5_checksum of file provided in /download/{id} + If cache=True, verification only happens during data download + from network. + Returns ------- @@ -727,8 +759,11 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None, shape = None # obtain the data - arff = _download_data_arff(data_description['file_id'], return_sparse, - data_home, encode_nominal=not as_frame) + arff = _download_data_arff(data_description['file_id'], + return_sparse, + data_home, + encode_nominal=not as_frame, + expected_md5_checksum=data_description["md5_checksum"] if verify_checksum else None) # noqa: E501 description = "{}\n\nDownloaded from openml.org.".format( data_description.pop('description')) diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py index 43e8c97848958..8e7bb14bfc32f 100644 --- a/sklearn/datasets/tests/test_openml.py +++ b/sklearn/datasets/tests/test_openml.py @@ -8,6 +8,7 @@ import scipy.sparse import sklearn import pytest +import shutil from sklearn import config_context from sklearn.datasets import fetch_openml @@ -48,7 +49,8 @@ def decode_column(data_bunch, col_idx): # non-nominal attribute return data_bunch.data[:, col_idx] - data_bunch = fetch_openml(data_id=data_id, cache=False, target_column=None) + data_bunch = fetch_openml(data_id=data_id, cache=False, target_column=None, + verify_checksum=False) # also obtain decoded arff data_description = _get_data_description_by_id(data_id, None) @@ -79,18 +81,19 @@ def _fetch_dataset_from_openml(data_id, data_name, data_version, # result. 
Note that this function can be mocked (by invoking # _monkey_patch_webbased_functions before invoking this function) data_by_name_id = fetch_openml(name=data_name, version=data_version, - cache=False) + cache=False, verify_checksum=False) assert int(data_by_name_id.details['id']) == data_id # Please note that cache=False is crucial, as the monkey patched files are # not consistent with reality - fetch_openml(name=data_name, cache=False) + fetch_openml(name=data_name, cache=False, verify_checksum=False) # without specifying the version, there is no guarantee that the data id # will be the same # fetch with dataset id data_by_id = fetch_openml(data_id=data_id, cache=False, - target_column=target_column) + target_column=target_column, + verify_checksum=False) assert data_by_id.details['name'] == data_name assert data_by_id.data.shape == (expected_observations, expected_features) if isinstance(target_column, str): @@ -115,7 +118,8 @@ def _fetch_dataset_from_openml(data_id, data_name, data_version, if compare_default_target: # check whether the data by id and data by id target are equal - data_by_id_default = fetch_openml(data_id=data_id, cache=False) + data_by_id_default = fetch_openml(data_id=data_id, cache=False, + verify_checksum=False) if data_by_id.data.dtype == np.float64: np.testing.assert_allclose(data_by_id.data, data_by_id_default.data) @@ -137,7 +141,7 @@ def _fetch_dataset_from_openml(data_id, data_name, data_version, # test return_X_y option fetch_func = partial(fetch_openml, data_id=data_id, cache=False, - target_column=target_column) + target_column=target_column, verify_checksum=False) check_return_X_y(data_by_id, fetch_func) return data_by_id @@ -399,7 +403,8 @@ def test_fetch_openml_anneal_pandas(monkeypatch): _monkey_patch_webbased_functions(monkeypatch, data_id, True) bunch = fetch_openml(data_id=data_id, as_frame=True, - target_column=target_column, cache=False) + target_column=target_column, cache=False, + verify_checksum=False) data = bunch.data target = bunch.target frame = bunch.frame @@ -482,7 +487,8 @@ def test_convert_arff_data_dataframe_warning_low_memory_pandas(monkeypatch): msg = 'Could not adhere to working_memory config.' 
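These test updates thread the new keyword through every call. Assuming the verify_checksum flag this patch adds to fetch_openml, a typical call site would look like the sketch below (data_id 40945, the titanic dataset used later in this series, is shown only as an example value):

    from sklearn.datasets import fetch_openml

    # Default behaviour added by the patch: the md5_checksum reported in the
    # dataset description is verified, and a ValueError is raised on mismatch.
    titanic = fetch_openml(data_id=40945, as_frame=True)

    # Opting out, as these mocked tests do, because the local fixtures are
    # deliberately truncated copies of the real files:
    titanic = fetch_openml(data_id=40945, as_frame=True, verify_checksum=False)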
with pytest.warns(UserWarning, match=msg): with config_context(working_memory=1e-6): - fetch_openml(data_id=data_id, as_frame=True, cache=False) + fetch_openml(data_id=data_id, as_frame=True, cache=False, + verify_checksum=False) def test_fetch_openml_adultcensus_pandas_return_X_y(monkeypatch): @@ -499,7 +505,7 @@ def test_fetch_openml_adultcensus_pandas_return_X_y(monkeypatch): _monkey_patch_webbased_functions(monkeypatch, data_id, True) X, y = fetch_openml(data_id=data_id, as_frame=True, cache=False, - return_X_y=True) + return_X_y=True, verify_checksum=False) assert isinstance(X, pd.DataFrame) assert X.shape == data_shape n_categories = len([dtype for dtype in X.dtypes @@ -528,7 +534,8 @@ def test_fetch_openml_adultcensus_pandas(monkeypatch): target_column = 'class' _monkey_patch_webbased_functions(monkeypatch, data_id, True) - bunch = fetch_openml(data_id=data_id, as_frame=True, cache=False) + bunch = fetch_openml(data_id=data_id, as_frame=True, cache=False, + verify_checksum=False) data = bunch.data target = bunch.target frame = bunch.frame @@ -566,7 +573,8 @@ def test_fetch_openml_miceprotein_pandas(monkeypatch): frame_n_floats = 77 _monkey_patch_webbased_functions(monkeypatch, data_id, True) - bunch = fetch_openml(data_id=data_id, as_frame=True, cache=False) + bunch = fetch_openml(data_id=data_id, as_frame=True, cache=False, + verify_checksum=False) data = bunch.data target = bunch.target frame = bunch.frame @@ -606,7 +614,7 @@ def test_fetch_openml_emotions_pandas(monkeypatch): _monkey_patch_webbased_functions(monkeypatch, data_id, True) bunch = fetch_openml(data_id=data_id, as_frame=True, cache=False, - target_column=target_column) + target_column=target_column, verify_checksum=False) data = bunch.data target = bunch.target frame = bunch.frame @@ -997,7 +1005,8 @@ def _mock_urlopen_raise(request): monkeypatch, data_id, gzip_response) X_fetched, y_fetched = fetch_openml(data_id=data_id, cache=True, data_home=cache_directory, - return_X_y=True) + return_X_y=True, + verify_checksum=False) monkeypatch.setattr(sklearn.datasets.openml, 'urlopen', _mock_urlopen_raise) @@ -1030,12 +1039,13 @@ def test_fetch_openml_inactive(monkeypatch, gzip_response): _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response) glas2 = assert_warns_message( UserWarning, "Version 1 of dataset glass2 is inactive,", fetch_openml, - data_id=data_id, cache=False) + data_id=data_id, cache=False, verify_checksum=False) # fetch inactive dataset by name and version assert glas2.data.shape == (163, 9) glas2_by_version = assert_warns_message( UserWarning, "Version 1 of dataset glass2 is inactive,", fetch_openml, - data_id=None, name="glass2", version=1, cache=False) + data_id=None, name="glass2", version=1, cache=False, + verify_checksum=False) assert int(glas2_by_version.details['id']) == data_id @@ -1046,7 +1056,8 @@ def test_fetch_nonexiting(monkeypatch, gzip_response): _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response) # Note that we only want to search by name (not data id) assert_raise_message(ValueError, "No active dataset glass2 found", - fetch_openml, name='glass2', cache=False) + fetch_openml, name='glass2', cache=False, + verify_checksum=False) @pytest.mark.parametrize('gzip_response', [True, False]) @@ -1058,7 +1069,8 @@ def test_raises_illegal_multitarget(monkeypatch, gzip_response): assert_raise_message(ValueError, "Can only handle homogeneous multi-target datasets,", fetch_openml, data_id=data_id, - target_column=targets, cache=False) + target_column=targets, cache=False, + 
verify_checksum=False) @pytest.mark.parametrize('gzip_response', [True, False]) @@ -1071,20 +1083,20 @@ def test_warn_ignore_attribute(monkeypatch, gzip_response): assert_warns_message(UserWarning, expected_row_id_msg.format('MouseID'), fetch_openml, data_id=data_id, target_column='MouseID', - cache=False) + cache=False, verify_checksum=False) assert_warns_message(UserWarning, expected_ignore_msg.format('Genotype'), fetch_openml, data_id=data_id, target_column='Genotype', - cache=False) + cache=False, verify_checksum=False) # multi column test assert_warns_message(UserWarning, expected_row_id_msg.format('MouseID'), fetch_openml, data_id=data_id, target_column=['MouseID', 'class'], - cache=False) + cache=False, verify_checksum=False) assert_warns_message(UserWarning, expected_ignore_msg.format('Genotype'), fetch_openml, data_id=data_id, target_column=['Genotype', 'class'], - cache=False) + cache=False, verify_checksum=False) @pytest.mark.parametrize('gzip_response', [True, False]) @@ -1095,7 +1107,8 @@ def test_string_attribute_without_dataframe(monkeypatch, gzip_response): assert_raise_message(ValueError, ('STRING attributes are not supported for ' 'array representation. Try as_frame=True'), - fetch_openml, data_id=data_id, cache=False) + fetch_openml, data_id=data_id, cache=False, + verify_checksum=False) @pytest.mark.parametrize('gzip_response', [True, False]) @@ -1106,7 +1119,7 @@ def test_dataset_with_openml_error(monkeypatch, gzip_response): UserWarning, "OpenML registered a problem with the dataset. It might be unusable. " "Error:", - fetch_openml, data_id=data_id, cache=False + fetch_openml, data_id=data_id, cache=False, verify_checksum=False ) @@ -1173,3 +1186,37 @@ def test_fetch_openml_with_ignored_feature(monkeypatch, gzip_response): # so we assert that we don't have the ignored feature in the final Bunch assert dataset['data'].shape == (101, 16) assert 'animal' not in dataset['feature_names'] + + +def test_fetch_openml_verify_checksum(monkeypatch): + data_id = 2 + _monkey_patch_webbased_functions(monkeypatch, data_id, True) + + # modify a mocked file content to change checksum + dataset_dir = os.path.join(currdir, 'data', 'openml', str(data_id)) + original_data_path = os.path.join(dataset_dir, + 'data-v1-download-1666876.arff.gz') + backup_data_path = original_data_path + ".back" + shutil.copy(original_data_path, backup_data_path) + + try: + with gzip.GzipFile(backup_data_path, "rb") as orig_gzip, \ + gzip.GzipFile(original_data_path, "wb") as modified_gzip: + data = bytearray(orig_gzip.read()) + data[1] = 37 + modified_gzip.write(data) + + # succeeds if checksum validation is explicitly overriden + dataset = sklearn.datasets.fetch_openml(data_id=data_id, cache=False, + verify_checksum=False) + assert dataset + + # fails with checksum validation (default) + with pytest.raises(ValueError) as exc: + sklearn.datasets.fetch_openml(data_id=data_id, cache=False, + verify_checksum=True) + + assert exc + finally: + shutil.copy(backup_data_path, original_data_path) + os.remove(backup_data_path) From 5df30c063585ab84d8bf89d7795483e685e1b8ef Mon Sep 17 00:00:00 2001 From: Shashank Singh Date: Sat, 24 Aug 2019 17:28:58 -0400 Subject: [PATCH 02/25] python3.5 compatible multi-line string --- sklearn/datasets/openml.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py index 826e0b57977e4..d5e02728c845c 100644 --- a/sklearn/datasets/openml.py +++ b/sklearn/datasets/openml.py @@ -100,8 +100,8 @@ def 
_md5_validated_stream(input_stream, md5_checksum): bytes_content = input_stream.read() actual_md5_checksum = hashlib.md5(bytes_content).hexdigest() if md5_checksum != actual_md5_checksum: - raise ValueError(f"md5checksum {actual_md5_checksum} \ - does not match {md5_checksum}") + raise ValueError(f"md5checksum {actual_md5_checksum} " + "does not match {md5_checksum}") return BytesIO(bytes_content) if data_home is None: From df4c0493c4f1bb89c373035d0311eba908db5223 Mon Sep 17 00:00:00 2001 From: Shashank Singh Date: Sat, 24 Aug 2019 22:09:56 -0400 Subject: [PATCH 03/25] use titanic local file and format string --- sklearn/datasets/openml.py | 32 ++++++++++++++++----------- sklearn/datasets/tests/test_openml.py | 10 ++++++--- 2 files changed, 26 insertions(+), 16 deletions(-) diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py index d5e02728c845c..a56d1d00e6072 100644 --- a/sklearn/datasets/openml.py +++ b/sklearn/datasets/openml.py @@ -92,16 +92,26 @@ def _md5_validated_stream(input_stream, md5_checksum): """ Consume binary stream to validate checksum, return a new stream with same content - :param input_stream: Stream to read bytes from - :param md5_checksum: Expected md5 checksum - :return: Stream with the original content for consumption + + Parameters + ---------- + input_stream : io.BufferedIOBase + Input stream with a read() method to get content in bytes + + md5_checksum: str + Expected md5 checksum + + Returns + ------- + BytesIO stream with the same content as input_stream for consumption """ with closing(input_stream): bytes_content = input_stream.read() actual_md5_checksum = hashlib.md5(bytes_content).hexdigest() if md5_checksum != actual_md5_checksum: - raise ValueError(f"md5checksum {actual_md5_checksum} " - "does not match {md5_checksum}") + raise ValueError("md5checksum: {} does not match expected: " + "{}".format(actual_md5_checksum, + md5_checksum)) return BytesIO(bytes_content) if data_home is None: @@ -123,16 +133,12 @@ def _md5_validated_stream(input_stream, md5_checksum): try: with closing(urlopen(req)) as fsrc: + if is_gzip(fsrc): # unzip it for checksum validation + fsrc = gzip.GzipFile(fileobj=fsrc, mode='rb') if expected_md5_checksum: - if is_gzip(fsrc): - fsrc = gzip.GzipFile(fileobj=fsrc, mode='rb') fsrc = _md5_validated_stream(fsrc, expected_md5_checksum) - if is_gzip(fsrc): - with open(local_path, 'wb') as fdst: - shutil.copyfileobj(fsrc, fdst) - else: - with gzip.GzipFile(local_path, 'wb') as fdst: - shutil.copyfileobj(fsrc, fdst) + with gzip.GzipFile(local_path, 'wb') as fdst: + shutil.copyfileobj(fsrc, fdst) except Exception: if os.path.exists(local_path): os.unlink(local_path) diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py index 8e7bb14bfc32f..14f236929bb51 100644 --- a/sklearn/datasets/tests/test_openml.py +++ b/sklearn/datasets/tests/test_openml.py @@ -1189,13 +1189,15 @@ def test_fetch_openml_with_ignored_feature(monkeypatch, gzip_response): def test_fetch_openml_verify_checksum(monkeypatch): - data_id = 2 + # locally, titanic dataset is not trimmed, hence a valid + # candidate for a checksum test failure + data_id = 40945 _monkey_patch_webbased_functions(monkeypatch, data_id, True) # modify a mocked file content to change checksum dataset_dir = os.path.join(currdir, 'data', 'openml', str(data_id)) original_data_path = os.path.join(dataset_dir, - 'data-v1-download-1666876.arff.gz') + 'data-v1-download-16826755.arff.gz') backup_data_path = original_data_path + ".back" 
shutil.copy(original_data_path, backup_data_path) @@ -1203,17 +1205,19 @@ def test_fetch_openml_verify_checksum(monkeypatch): with gzip.GzipFile(backup_data_path, "rb") as orig_gzip, \ gzip.GzipFile(original_data_path, "wb") as modified_gzip: data = bytearray(orig_gzip.read()) - data[1] = 37 + data[len(data)-1] = 37 modified_gzip.write(data) # succeeds if checksum validation is explicitly overriden dataset = sklearn.datasets.fetch_openml(data_id=data_id, cache=False, + as_frame=True, verify_checksum=False) assert dataset # fails with checksum validation (default) with pytest.raises(ValueError) as exc: sklearn.datasets.fetch_openml(data_id=data_id, cache=False, + as_frame=True, verify_checksum=True) assert exc From f4ca32b5d824330edbca3a3706d0835f9a180075 Mon Sep 17 00:00:00 2001 From: Shashank Singh Date: Sun, 25 Aug 2019 18:55:15 -0400 Subject: [PATCH 04/25] update locally truncated arff md5sums --- .../openml/1119/api-v1-json-data-1119.json.gz | Bin 707 -> 711 bytes .../data/openml/2/api-v1-json-data-2.json.gz | Bin 1362 -> 1363 bytes .../openml/292/api-v1-json-data-292.json.gz | Bin 547 -> 547 bytes .../40589/api-v1-json-data-40589.json.gz | Bin 596 -> 598 bytes .../40675/api-v1-json-data-40675.json.gz | Bin 323 -> 323 bytes .../40966/api-v1-json-data-40966.json.gz | Bin 1659 -> 1660 bytes sklearn/datasets/tests/test_openml.py | 7 ++----- 7 files changed, 2 insertions(+), 5 deletions(-) diff --git a/sklearn/datasets/tests/data/openml/1119/api-v1-json-data-1119.json.gz b/sklearn/datasets/tests/data/openml/1119/api-v1-json-data-1119.json.gz index 97ab0d1ce64f686215c763732e3c303fcfae22db..286e57672339a1ad99633afd2ea918d62c8c13b3 100644 GIT binary patch delta 694 zcmV;n0!jVD1;+&lABzYGDi32_kq94uO-~~+488BKu;v7#e$14%v~vS0v}hNE3Y-{J zrHQAFcAAV5PrD<;f5!>kT{!K{cI;>S<@q&Hg2c)rtrU9cbVJ&jq4R;Z#E|@a`*XQ`IoT35ML8^o;s+vl3A;pbrQE@d2fO<$Z1pp)(?^E^4^4rb35CB7|v-J=CZ+2n+3{WzM?D z!GdXrTWuJw2l;$H4f~WPg!wa|a&{M61UkTBW4$Kpx&r>gOaMvKJ4;w_z4oAvh-_+w zhBT7`WFz?T2f;HmD$oX=u5Nm;Ziz>)I}XUY<&TZYQI?PUdDhR)AG53&=fxywwAM0k&hPi_nw?PQYIegEK8|rX90p%T;fCdKpR;8T zCbTGrlW~@fnA}fN9|%`}RbeV!<0_s-Op2PX1m~?txD8*BAaa_?3q!~tDo%0sz%^f< zZWbITu}O+&MsO@d#erQ%+jZ-S8D52Z^3*kvY-?-Lk+L_aA|?NZ=Wa@?tbE&cW1-GwqseTRRnxIJ c$7*`6GC5yov+;}xd3*cz4je#EYCZ%20P|B&1poj5 delta 690 zcmV;j0!{tL1;YghABzYG4$-??kq94uQBUJA4E`%tpP<^-c3Xk<4VciOI}jT1MA4Kc z?iO8_REfJ?h4}C1l-(UX-J9*$pY1Q7-y$VQtUc0Np_fj#q^*gHw@9m~h_WnOM5D-v z9kM4>*U(s^@tx=B9-SYOql~>TV|w^*w_>kB7zJ0txgM^6Q2s;* zP?|V?w4IX}l3(tAtX8ilTcWlolQI#%teHqZsrqh$sw?s|P0QpI;G^)NLE#akCOoz} zHX1=0JxbANvw`OpXp3+uW#@$K!Pamu^d84oqDK$cjqqOAn%5~nz!IendyStQB7EyB zr&Sq^;K5mms2I=(gn;;n((*okw}#Le5mpxsTw7D4Ln9HwG{znpR2zh)_R=zE{lLMJ zX@@&)7;XgFVsRGsDNP9TXF%=jZfFtc0Eeyhnyl*!_zyDyB#!SbVZrshx&o}p2JHt>9XJA$es9=+)~Ae)Xqc1R{ChH(eP5g1D!Ve~YAgeFV}!pk5M zIGLV)`b68J*)Pdh;@qs!IJ4f|c&SfSRO z5M($_v-vnn$LaY~nilh{$b&{3Ed%HL{?JwIgfiE#8>aBX7>C1Q@?{imSSI_7EqgGb zMVaLDG@UZJpQJt!uCBv>RJy_Sa27+-p!r&G-kOBl@C6AXr7tm;!vHE@%Pk+J?{=JYb}`E#rz%&Xx~ONEXEn}c Ybv`>2xu{VJ6}`Xz1{munjXeYa0PfdIEdT%j diff --git a/sklearn/datasets/tests/data/openml/2/api-v1-json-data-2.json.gz b/sklearn/datasets/tests/data/openml/2/api-v1-json-data-2.json.gz index 22dfb6ff61c1b2693bfe040418e754d7f586239e..921326973b21228f939cf767d06fffd1aae106b0 100644 GIT binary patch literal 1363 zcmV-Z1+4lXiwFp13S(UW17UD!Ep{<2YIARHEo5PIVJ$K)YIARH0F_qXQ` zIwvRRrOUKAIhnw{dQ^I)LU^qebv%K14X^1^BrtW{NTC)$RRW(4UBgAGoKPHKrcO?# zx-^V?im&&VH{V`n&J}ht>(08=uuLp@D3}dPCPJQ+vu-YDT|X(vy3HSeqaRBv`n-^LNwhl9w%VSSDQ+F8$ZNx$-Dq-UmnouA$ItO$Uf(P>N zlsU*bRUV^LaM+o0>Po|Hr2~`HT5Jmol9~mVYbc)-+^`c63RxVLvNVfzH84kUjiAzk 
zWr8cliRYk@Av-vO@0=@aW%x-YVr}KT$ktXc4Bc~Bjie$+_blUi)m<1}aN`8_VYqbT z2Aswqn5py>&O%Z$C!}z7GzO((wIQP>QQTUBV;}vr1uCEohm|&uG?qMs8;4v=EtlxU z$6IZR9al8g%bLz`X4ckEUHn_E2{@i(h?R!khfogxYYJQ9!l8>@j1Lg!u7GLn3EDs^ca&}q0?fyrLbC~Op(8!f#g6Yuq|v7Uu~L?S8$HZe^x zroS4*E7Zel6jwW7(jX$zCj&AhrwwZC=VrKCxNL7IQ7;6gRX`WK4x^^}&{$D;Tbh(I z9;CR)))U>LEJ0)MZ|(n-v<@a9ugM#7MH-1Cl$iT#O}4`QE!YUfjahpY@cFXwgQ~{q zUMtEOyS-MFGhX4n!rO?8)+A8PSneH;D(1f!x7I$&$1=Qg+K9`I73C{3ZFh#>ZmcLC z!`+m$&0A)aEvIHFp0}c0{CnZ#^H!8A<;Ljets}p`jM<55YxQ!;lFzy^-m;mSb(hpA zypr)MZ*TkyJS3mUcB&|%Ed6$yJV%Nm{#_cK=FeFND5Bjf@xY6?-|BZ@MvI-rz5Mg$ z4+7J-Z{9RVYJg(8Ot8l`>er(%KuuR7X&sj5Kq#%`YB!$WK&Y@5zig?1e0e%n1Fx~( zkqV%P59yuy*T}!lM!g1&(iW6GR_C)hIJKooUK^UtnIiWdP=kV?N`*(k5iBV#EG8*T z2ACJ|ykfRHQrgXFQDC0+H}?jFmv_dE&tIy!P5_Q_VJ(I+uQ5mPvd&_7Eh>n4o%XQg zpqNbg0y9Y>uukEQZy0_j_h6gpl}{i(LDJazY+7f(r}6c;h$0XpYxAbS9M=m zNKKPDmH}1y)bB;Zpof2>yZ&GjjVJwKjl*erdXKp+WBh7{8|k~YrO9fw3NaI_T!z{# z>QV=fLWC`>4ZN>eHyVvbug*qT$5e$vNKKkH5emuYbzRqhp6a%gmeS3gFEvp1r<~`3 z)ne`xd~>`pxonkvjxO-)Ysb1WS+*bJ=!w&wpYD=G#c_#9<%X?r)*65uz%W*;uL>| VZdbkIub1`ndPU z{$3}gj!rFi)0A6gMB#*1oe})l-`nem6kDBshxiXgbM8N>Qk=@VZ^4bN@tvsd*=Fe+ z9i5jh)8^=C1P|&->7@$cjaJn06p|IZp$n11*l{C;ngdlSd^U6i7o~DSae#?BIvVTJ zFzzY7K3v{@d!0E~*wLgr=~BZovEZR#HY}M4d0I}oxtw(UtZ-6x(mk)jMe;)kAUZqi z)tq;dBLJ9i|Bo04P(Qh8p`~D}Fmt&)^pq2IpK;Je%!Q&7)^4E*1!hL)fNnzYK>nRF z2RWz8WAqFhcFs6;rQx>Hfk|pDwuJ>r&78{>l+OyT*$D`RERIT9n)#|4n4`ExP-($3 z!4>1gb5O{T9h|{;&K0&Y{G<}GwsKx%D=Qd=?m4VRQjw#3mhrsmE{ra?aRU1=T)K7x zPGbCA|iba3%>A_{TjBl|Y=q(_ti1~Od|CTJRg>9H zE6SO;omP}HS>nCI+lY(SBv8#n?i`LP=D!#>);`L|GQ4xzh|9GVWFQPr3=-vYDH77t|=c zlJP2UZ~O~9B%jD;swko?{dOC@M2aH*T^c>hU$PERM7vYsffsSN)$hQJ7F&yZ`RDB) z1jaXS-!?~TfMU8#u%|Za*P}2%O_w5V9hR3sD6QmbJD%S_sIV5lY^i{Jc|KMHud&{e z3ZRD%=^gu5>|ZBwuR)`<1!Ygw`D_kOZE2F%nr3~b$b$#epdhGH;X!Z!3yKSiNeYtz z=0!ZOnC%Xfc2imum}mXXy$0dsopIyymujvPfP-9Ei(yP_%mKWvvlw2B3KCwYJuEpW zCR0AgOp*$$Q@G=6hTq9O*k*d^6NpccG_gLL*4giQd_C?ktgD|oJ@~@s{NvSC-B%V; z({zeuKvh2Vd(klH;V-`L4@S}1s6VW6I89IQF?VHxU(IkMeb=TmSuU3$W@44gP@8#O z>flj`u!XgO_ciH8aU7qX#8}5vg+fTpY}Q05B%juGT?2Zq+h(+sZt8rgfwDj6JP)iE zbF1K+;*H5=tL$@hj$dCp)|KgMik9B$f!cKc-09=7l&y;eK+Z7jb!-)iT=iu$7V zo{N@9SOvJ!R@Eui9t&Rg=IQYi%NO=!%iPkpHd^-j-{UOYUB Uqt5=`&;6hO0e4b}%oPg&0G9@#9smFU diff --git a/sklearn/datasets/tests/data/openml/292/api-v1-json-data-292.json.gz b/sklearn/datasets/tests/data/openml/292/api-v1-json-data-292.json.gz index 888140f92b36025bdaf00eba9eeb917fd9c3964e..3313b6fb4a0a26fd6c4561e80f51b93b4b7390b3 100644 GIT binary patch literal 547 zcmV+;0^I!{iwFn<4r5&a17u-zVP9lrb7OL8aCB*JZZ2wbZ*BlpQrn8#Fc5uCzCyTf zVH4Y0JB|I8mxV$LOS4@lbunXEj>XE7(Ip{W_TMX6X`O&EXJ(G(a%SF_i>L(ywWg+S zsV0hzhDi~X=zY0ZM67iv!Uc{VyeVLy-m$mZDBui4@SmvC;Z>9d&wZ|Fwc1KKLT8-xJke|(s_J9;MiqR8oZ?kqv4%0&(I?*okrV?KaI)g?+s=;OXL)(on z6gvFhKC0rILY4T0WQPa`t*I^%zB+vU5^am*YIhWjRi2S%L8#7GF|NZGH zgotu%sixSyX4kmnoDwf_?dnX6nTHD$J8}u_eK^^z)BGjfmPu0LUEpy>s2~&P**4C) zK3CkkdSgaiR&f=d&*zOCsTlZ1D%XymuZmBpLvCljRuyN(4&y4G#o6dE8K|zi4$U~N z!);CMe>c;C<)${Ej_w@6xRo_FbpA6?3>=;HD;h4QZ(IQ;jZ zGD9^VEmtzB!25EsNOV&heyg9yn0|3i3Ju0Uu3*jH~RjP{wD}ZuXtRP&$B66MtBZdLW?L-VtET4OB zfPc1Upd*8F2F&U}=*_qG05r3jvomXNvvyhy(?=1N7@v8K|nF^q*Bh;_JcAsDEjuGp0V89{{stZ+Yn9*@0+OYcY{qZq| zsP=57r`Y|>u5l#YJYJcG1j$jcd(o>%4fVO zW?bCxOElxp$?tg9-0+4AZr_50@n$f_@57nkZ5i<^IQs#&F*LMgO?yUPjT-`?fn~Wb lXonao_C?u1RKVSKvukKcQMrN1a`CbJ_zTS2bE$~~005Kz4Z;8b diff --git a/sklearn/datasets/tests/data/openml/40589/api-v1-json-data-40589.json.gz b/sklearn/datasets/tests/data/openml/40589/api-v1-json-data-40589.json.gz index 9c71553ce5137c44e095e6a40f9c3817f0130d31..97089490a18c9e01f6a30a0936dc49cea9bf18cb 100644 GIT binary patch 
literal 598 zcmV-c0;&BUiwFpq4`W>b17u-zVP9lrb7OL8aCB*JZZ2wbZ*BlplgVz}FbsyzGfyG- z)BY<cF5D+d^c^mE=sPLEgPu?4rFG5=H(bKhp2pO~SxJW4LV? zH&lzs3NJqV{wZy00+SAl0J7`3?=U5fmh9h2#cZh3!#=sGN=EnlrQnI){+J zAQ%}smS}R1f?A~wMqhyT$f&VH6h=gNbalZ&O^qBWjRvK;&uM$UO1;AXzd1|ItS2oD zXFJrxG~LokatK$=CHxX&m%0jv9r`&7Zc}50lsjaArP7JZ+s+l$>01h2r@1lWjep=s zs#SUb>ml7gKRiU+JV^ztacBbUGB1v4UZzF$y(})p`BGNVPYTLM8hpOa#NC!~ExoC? zb~2YbtF!C%iq*u&L1Lx5I`hoy?5%Jobsc@KI@?$KeQ_%5Y@GuJ-L^}O4t%rhLUdcF z+=82IV?lRrfxucV$Q+A~;INq&ZUGx!v8~i_pcbIv2+c9W3$uU>M`Vsz%t?iXb*+4d zp+R8=q02l!;c&tuw{R}8ik!fK kmyDdCE%z-{6?OU~KM-;{lVrR3*nVvP00T^uffIz;U zD~HkJ#JF6Z=+h$k8Bup~V-1om+MOQIFd0sXKrI(w(1qI`?GY7|v7mvWInf1#0tS(h zp+kw3b8>2>HW+;&v`0pc9in7J4a88F9Mt3pNNO}l4Zf!BVUfC#f&2!Rnn^3o4X_RB ziBi{82_WIAL6Bd3>{8d^ut7g34mLHG3$a0luv9p4aoxD4I((f)TO-e|@YKJDkF7x7$=4D#!zn8_iIGxLVj1!!~NJGrmsd2A4T?=pOtySh+ zWmR^$T(DFy4gyQvRhegAWgmsR$;;?#RoQmG-4@5P%9c4`qT6=v(Sfh$U5H`Jlw0B^ zTU(+#w?M>N&6_DU9l>EU&D;VuykJ|X;Xo}w!x5TehG%908IH&tu~?IGOP00rKMWt_ zX5hNa^CJ#7Jc6wmiDtrF+^8D(Zb$Qrf#pX|0(m9PEp6b17u-zVP9lrb7OL8aCB*JZZ2wbZ*Blxkjrj^Fc3x8@D-KU z1#I(3!>*frK-DD@8f-(XnuqKmsEYFM#W7v9yYZPjo_psWM??Zb>lC(9IbrpJdY(xA za*v~kXt`IUpw+1)uO3V=Dvt&O&hfx*%DON^SRj9YI>gB`nlAMF*^Yp|_5N|W1hy__ z2HEazo(ni-r-T*>tv(sASzecU&>1b%B%YdO?e^C(ZjDD*#KqN3)@0kZrSqbu(WJ8- zn#{*+vK!m0d7J#MCM(WW$xGT~!C?xv?+=ICsP;%7M8C;;SPixacGyn_h*o!Nuq^y1 z$4Ru)oi-Z3|NC?ZxH?az0)hIr=ar!~%CZ$#CBED`dI8XLnL~FLZ?s~SR7~a_6+Gtx V3Qk2q;^+}S;vb?Mo>IgC003^+n{ogE literal 323 zcmV-J0lfYniwFovwM$z717u-zVP9lrb7OL8aCB*JZZ2wbZ*Blxkh^ZfFbqX!`zs2~ zO)Oh}Al|y<1B#9Y3|fywi2Q&=$0&mQd!@)7+KoEABJbrRO^5)5-bw6*blmC%^*oXG z?U5!4(ZW}xq;*q~_W&jsWk44L&J~f}rFHQS$s_qYX%`=s(G0P#W;+6g*2m}h9ND^< z8AR{hycQHRZw|CP(E2Mc+MKl&i#nsZocK$#tb>2;->nJginzG8%erjaw&T3WX*9># zUb-yA?6Q0At9hHkt}ZK2byZZf%c8?0SfzG{+Q@!SA4R{(6s!h&1bgu(14OHbHCpCj z$bJ&-^q`H#@Bcm>0N Vsv5-|E*n{T>W5rWyPyW(C(QfD2~lJmXx#+YsI`oeIe{9_!C5^VzB? 
z9S?*u03*4-KU9TIIEi`x`{4XTeQxc-^hO7x1E!O?xaXlrOjsl$0p;FmRr7X~lE}rE&uUEv!gb23El(XIc`Ekig(d6{!76r91<) z{!}d`3|BJMD(4|wU#cOz`|UNvQT=$Y2Rw?K-MD!ij^s#gOoEKx^URQdb{iIJtvOnT zpiR^!0k`arqfV8@8BrP(!fP^b>lzX!VZwbzq|ONp3G-zD-R~*o0AjV#KGua1EFO5ulD4m&&ddX!wNB*}b4JqGPN^ zLP8coKfxSHse&=TH{?#s2DWqmmN%d9t%V0*9t#XlI9_0HG*;wA#;mU)S1?d7i>_hj zD!JQps1yaqct*qlb$8b&DLoD53#vtDZ>rHH;B<;rv97dK7c=yK&aiPwPCS|?3LCwH ziv(;UGGSL6I8;K_M0w5-AwQuOG-E03e52_Z>^^=2AGIESeEj$k`!ghs!LWe`)r@|| zLBSJchGk?*z3W)^;89rq6T&vGsZ;822^5OI;;( zrFbvzxiA)H}bmTi*L@+$0f&`UW@PdrF9FhEveEApcJmPk8?)pTMP%cOT#rZs)j8g$pHl zSQ5B&?$2|K|J79hr*kH8)ZXD6Ktlf}fL}85c|U+#g-C{IpE9`p^#}NKNSmVqWWyx_ z<*scmt%1`OW-u=N&@Z>iba}y+`Kc$sEgCn?3c>r@x^!y+)EqS-Xyw&qV=#z2L}hG|uEz0UYxB|Fy6ARG_IqK3-$e-4V#}A9N479@N{!~8 z`_nvajgvV~?o5$8kHp6@OD2=3&5x#s>=3&dKZ&ez?qyMF_Y)4Y8y G4*&ofDk8)H literal 1659 zcmV->288(^iwFp1x=ULC17u-zVP9lrb7OL8aCB*JZZ2wbZ*Bn9S6OcxHxPcdf5kun z0@+!uwUTxENtNT&vE3M!+oFg9h1#Vg#=E3IawRK4{(EQ0)#bQIlRovqu$G+Tn|t`> z@Sv11%SMLVQNm5E#oP)dOI`SKcyLe>3H~abRy)mRDWC(%GEPS~BIdVRSuUi%Sa5AT zW9ihN?29WMANO)QQ~LP03;mR72EU4F&NU0*irE=YIhXJ@gf~nlLUN7A`t!3Aj6MD;ICdk&NVj#`v@vft40m*DsEGb^biZ{^b|m$A8gRU)!V>hx_S#`Vm*QZPbZak zddj5uEaVg}IJ3FtaKq*XE)y=Tn5>{sZlJG)6){V}D!ArMOX3j{7~H5Fwcn_Or-0U< ztEGg&N+w!mJcQe8HGp@&y@se-ZXR`kSEE`dsx`x*9LlXpkn#mj4GCzqV7}Ixp=Agv zAx-FYh`HutjVNNJEq^5PHpD42hCAL^Nvl-~OeP+PDS$wRt_1!#R|ch_4MfQwElkiB z`bY$Y9*qjnB=r9S!h)kZ(djU>Hw^fA#jp zGtLj>p86?REsVkge^}mg*iirzrO6k{G8foZ&C!K$Dc7cPEiz0L696V-5i559OH!@* z6n()de#p;RJX_lh$O?vK^OR%GQvwxmQ78ABcL4{aaAV73)Cd468m;P{(!kk8ynBZ2 zF`sUjqkU$lABxg*${8i#j{Q-!z0cx|C=3eWHJP__4Kb52<~}15=LClGBt|abb1tdE zp1NS2PFzf9b~mqm)X)Imrp732%qTxG;uBxE2273!P{))@Wmj`Fe9UKTAt;RK7^{(x zkcH5TF-H=rV9f6=xzn~e>}m^#N@zb(o-;(qPpAdWSPDDaYI+8{k6+(Mt&1NYKYqmi3~{ADsNlhVMnB`A z;E6KBQZl97aV$r$5tjXgu&ry#l=?dY2mNo+Q7C>IAqxBk;lC94Pbh8SLm#l!kNdFo zR9N;C!gkPg_T+gN?#ufRF0_RUe@fu9AYCf@=E6h%T~bxJVeB*Ar~Qi;FQ8`;@Hn>|M+COv%+0bc-8%JAu_|1iBS$R^TW$D* zrMd5L-y4)?e%#?gw-1vLPE?wzCH2G=ykxP^@!3nBYjyTg@l;IDMsn|Tm{+SE>MDsF zx2`>)ttfxx=hoA$$<*TIbNsz)EM?)nX*SN~4_ICIR!5!FP89lA zj;~63;Pmb;F8oxca!a5&~{jqNq}B-$%`Ewc3puo-V~!B4bB$n~(9UVAm-f zZQnJ^%e<4=VH9U!0om#tDtNJHS;hm7_$)v!E-Wd7k+?sU# zOq;MgwIj>)6d!?X-EoN0^&?NF)e_nFubtyA&piQd(YR^u5xg(0OScq2X;Yl^u`ayQ z2ghvL)X~2~D{rnV{eILYDr56xHHzwu?MG+lqSGnZ7s3d?ix8~&jxRBfY;Ne38qI?H z(=2I>;u()0OrFtQJw9!9ShL-3j3=ymT5oq+QN7)+wd*X3dA(LTJotL}^>27iFrF Date: Mon, 9 Sep 2019 20:12:39 -0400 Subject: [PATCH 05/25] return bytes instead of stream, read once --- sklearn/datasets/openml.py | 70 +++++++++++++-------------- sklearn/datasets/tests/test_openml.py | 10 ++-- 2 files changed, 40 insertions(+), 40 deletions(-) diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py index a56d1d00e6072..721d879d1e1f0 100644 --- a/sklearn/datasets/openml.py +++ b/sklearn/datasets/openml.py @@ -1,9 +1,7 @@ import gzip import json import os -from io import BytesIO import hashlib -import shutil from os.path import join from warnings import warn from contextlib import closing @@ -63,7 +61,7 @@ def wrapper(): return decorator -def _open_openml_url(openml_path, data_home, expected_md5_checksum=None): +def _openml_url_bytes(openml_path, data_home, expected_md5_checksum=None): """ Returns a resource from OpenML.org. Caches it to data_home if required. 
@@ -79,8 +77,8 @@ def _open_openml_url(openml_path, data_home, expected_md5_checksum=None): Returns ------- - result : stream - A stream to the OpenML resource + result : bytes + Byte content of resource """ def is_gzip(_fsrc): return _fsrc.info().get('Content-Encoding', '') == 'gzip' @@ -88,40 +86,38 @@ def is_gzip(_fsrc): req = Request(_OPENML_PREFIX + openml_path) req.add_header('Accept-encoding', 'gzip') - def _md5_validated_stream(input_stream, md5_checksum): + def _md5_validated_bytes(bytes_content, md5_checksum): """ Consume binary stream to validate checksum, return a new stream with same content Parameters ---------- - input_stream : io.BufferedIOBase - Input stream with a read() method to get content in bytes + bytes_content : bytes md5_checksum: str - Expected md5 checksum + Expected md5 checksum of bytes Returns ------- - BytesIO stream with the same content as input_stream for consumption + bytes """ - with closing(input_stream): - bytes_content = input_stream.read() - actual_md5_checksum = hashlib.md5(bytes_content).hexdigest() - if md5_checksum != actual_md5_checksum: - raise ValueError("md5checksum: {} does not match expected: " - "{}".format(actual_md5_checksum, - md5_checksum)) - return BytesIO(bytes_content) + actual_md5_checksum = hashlib.md5(bytes_content).hexdigest() + if md5_checksum != actual_md5_checksum: + raise ValueError("md5checksum: {} does not match expected: " + "{}".format(actual_md5_checksum, + md5_checksum)) + return bytes_content if data_home is None: fsrc = urlopen(req) if is_gzip(fsrc): fsrc = gzip.GzipFile(fileobj=fsrc, mode='rb') + bytes_content = fsrc.read() if expected_md5_checksum: # validating checksum reads and consumes the stream - return _md5_validated_stream(fsrc, expected_md5_checksum) - return fsrc + return _md5_validated_bytes(bytes_content, expected_md5_checksum) + return bytes_content local_path = _get_local_path(openml_path, data_home) if not os.path.exists(local_path): @@ -135,18 +131,23 @@ def _md5_validated_stream(input_stream, md5_checksum): with closing(urlopen(req)) as fsrc: if is_gzip(fsrc): # unzip it for checksum validation fsrc = gzip.GzipFile(fileobj=fsrc, mode='rb') + bytes_content = fsrc.read() if expected_md5_checksum: - fsrc = _md5_validated_stream(fsrc, expected_md5_checksum) + bytes_content = _md5_validated_bytes(bytes_content, + expected_md5_checksum) with gzip.GzipFile(local_path, 'wb') as fdst: - shutil.copyfileobj(fsrc, fdst) + fdst.write(bytes_content) except Exception: if os.path.exists(local_path): os.unlink(local_path) raise + else: + with gzip.GzipFile(local_path, "rb") as gzip_file: + bytes_content = gzip_file.read() # XXX: First time, decompression will not be necessary (by using fsrc), but # it will happen nonetheless - return gzip.GzipFile(local_path, 'rb') + return bytes_content def _get_json_content_from_openml_api(url, error_message, raise_if_error, @@ -183,8 +184,7 @@ def _get_json_content_from_openml_api(url, error_message, raise_if_error, @_retry_with_clean_cache(url, data_home) def _load_json(): - with closing(_open_openml_url(url, data_home)) as response: - return json.loads(response.read().decode("utf-8")) + return json.loads(_openml_url_bytes(url, data_home).decode("utf-8")) try: return _load_json() @@ -489,16 +489,16 @@ def _download_data_arff(file_id, sparse, data_home, encode_nominal=True, @_retry_with_clean_cache(url, data_home) def _arff_load(): - with closing(_open_openml_url(url, data_home, expected_md5_checksum)) \ - as response: - if sparse is True: - return_type = _arff.COO - else: - 
return_type = _arff.DENSE_GEN - - arff_file = _arff.loads(response.read().decode('utf-8'), - encode_nominal=encode_nominal, - return_type=return_type) + bytes_content = _openml_url_bytes(url, data_home, + expected_md5_checksum) + if sparse is True: + return_type = _arff.COO + else: + return_type = _arff.DENSE_GEN + + arff_file = _arff.loads(bytes_content.decode('utf-8'), + encode_nominal=encode_nominal, + return_type=return_type) return arff_file return _arff_load() diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py index 8ce2e2839fe6a..53cc2df71d240 100644 --- a/sklearn/datasets/tests/test_openml.py +++ b/sklearn/datasets/tests/test_openml.py @@ -12,7 +12,7 @@ from sklearn import config_context from sklearn.datasets import fetch_openml -from sklearn.datasets.openml import (_open_openml_url, +from sklearn.datasets.openml import (_openml_url_bytes, _get_data_description_by_id, _download_data_arff, _get_local_path, @@ -922,13 +922,13 @@ def test_open_openml_url_cache(monkeypatch, gzip_response, tmpdir): openml_path = sklearn.datasets.openml._DATA_FILE.format(data_id) cache_directory = str(tmpdir.mkdir('scikit_learn_data')) # first fill the cache - response1 = _open_openml_url(openml_path, cache_directory) + response1 = _openml_url_bytes(openml_path, cache_directory) # assert file exists location = _get_local_path(openml_path, cache_directory) assert os.path.isfile(location) # redownload, to utilize cache - response2 = _open_openml_url(openml_path, cache_directory) - assert response1.read() == response2.read() + response2 = _openml_url_bytes(openml_path, cache_directory) + assert response1 == response2 @pytest.mark.parametrize('gzip_response', [True, False]) @@ -949,7 +949,7 @@ def _mock_urlopen(request): monkeypatch.setattr(sklearn.datasets.openml, 'urlopen', _mock_urlopen) with pytest.raises(ValueError, match="Invalid request"): - _open_openml_url(openml_path, cache_directory) + _openml_url_bytes(openml_path, cache_directory) assert not os.path.exists(location) From 10ecf9a89528c39fe47a9a62d59d4fc4f99d34bc Mon Sep 17 00:00:00 2001 From: Shashank Singh Date: Sun, 29 Sep 2019 21:54:50 -0400 Subject: [PATCH 06/25] read and update md5 in chunks --- sklearn/datasets/openml.py | 62 +++++++++++++++++++++++++------------- 1 file changed, 41 insertions(+), 21 deletions(-) diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py index 721d879d1e1f0..612d769db94dc 100644 --- a/sklearn/datasets/openml.py +++ b/sklearn/datasets/openml.py @@ -86,38 +86,58 @@ def is_gzip(_fsrc): req = Request(_OPENML_PREFIX + openml_path) req.add_header('Accept-encoding', 'gzip') - def _md5_validated_bytes(bytes_content, md5_checksum): + def _md5_validated_bytestream(fsrc, expected_md5=None, chunk_size=512): """ - Consume binary stream to validate checksum, - return a new stream with same content + Takes in a byte-stream, reads in chunks and returns bytes. + If expected_md5 is not none, keeps md5 checksum state while streaming + and validates post stream consumption. 
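In other words, the rewritten helper keeps a running hashlib.md5 and feeds it chunk by chunk instead of hashing one large buffer in a single call. A small self-contained sketch of that pattern (chunk size and sample data are arbitrary):

    import hashlib
    import io

    def checksummed_read(fsrc, expected_md5, chunk_size=512):
        # Accumulate the payload while feeding every chunk to the digest.
        content = bytearray()
        file_md5 = hashlib.md5()
        while True:
            chunk = fsrc.read(chunk_size)
            if not chunk:
                break
            file_md5.update(chunk)
            content.extend(chunk)
        if file_md5.hexdigest() != expected_md5:
            raise ValueError("md5checksum: {} does not match expected: "
                             "{}".format(file_md5.hexdigest(), expected_md5))
        return bytes(content)

    data = b"x" * 2048
    assert checksummed_read(io.BytesIO(data),
                            hashlib.md5(data).hexdigest()) == data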
Parameters ---------- - bytes_content : bytes + fsrc : io.BufferedIOBase + input stream to read bytes from - md5_checksum: str - Expected md5 checksum of bytes + expected_md5 : str + expected md5-checksum value + + chunk_size : int + size of chunks to read at a time from stream Returns ------- - bytes + fsrc_bytes : bytes + equivalent to fsrc_bytes.read() but with md5 validation if + expected_md5 is provided + + Raises + ------ + ValueError : + if expected_md5 does not match actual md5-checksum of stream """ - actual_md5_checksum = hashlib.md5(bytes_content).hexdigest() - if md5_checksum != actual_md5_checksum: + fsrc_bytes = bytes() + file_md5 = hashlib.md5() if expected_md5_checksum else None + while True: + data = fsrc.read(chunk_size) + if not data: + break + if expected_md5: + file_md5.update(data) + fsrc_bytes += data + + if expected_md5 and file_md5.hexdigest() != expected_md5: raise ValueError("md5checksum: {} does not match expected: " - "{}".format(actual_md5_checksum, - md5_checksum)) - return bytes_content + "{}".format(file_md5.hexdigest(), + expected_md5)) + return fsrc_bytes if data_home is None: fsrc = urlopen(req) if is_gzip(fsrc): fsrc = gzip.GzipFile(fileobj=fsrc, mode='rb') - bytes_content = fsrc.read() - if expected_md5_checksum: - # validating checksum reads and consumes the stream - return _md5_validated_bytes(bytes_content, expected_md5_checksum) - return bytes_content + return _md5_validated_bytestream( + fsrc, + expected_md5=expected_md5_checksum + ) local_path = _get_local_path(openml_path, data_home) if not os.path.exists(local_path): @@ -131,10 +151,10 @@ def _md5_validated_bytes(bytes_content, md5_checksum): with closing(urlopen(req)) as fsrc: if is_gzip(fsrc): # unzip it for checksum validation fsrc = gzip.GzipFile(fileobj=fsrc, mode='rb') - bytes_content = fsrc.read() - if expected_md5_checksum: - bytes_content = _md5_validated_bytes(bytes_content, - expected_md5_checksum) + bytes_content = _md5_validated_bytestream( + fsrc, + expected_md5=expected_md5_checksum + ) with gzip.GzipFile(local_path, 'wb') as fdst: fdst.write(bytes_content) except Exception: From f8c8fe4c1aa88d03cbe03e7afad0c5708938e0e0 Mon Sep 17 00:00:00 2001 From: Shashank Singh Date: Thu, 10 Oct 2019 20:19:06 -0400 Subject: [PATCH 07/25] bytearray extend while chunked construction --- sklearn/datasets/openml.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py index 612d769db94dc..2cf5b85fdd6b1 100644 --- a/sklearn/datasets/openml.py +++ b/sklearn/datasets/openml.py @@ -114,7 +114,7 @@ def _md5_validated_bytestream(fsrc, expected_md5=None, chunk_size=512): ValueError : if expected_md5 does not match actual md5-checksum of stream """ - fsrc_bytes = bytes() + fsrc_bytes = bytearray() file_md5 = hashlib.md5() if expected_md5_checksum else None while True: data = fsrc.read(chunk_size) @@ -122,13 +122,13 @@ def _md5_validated_bytestream(fsrc, expected_md5=None, chunk_size=512): break if expected_md5: file_md5.update(data) - fsrc_bytes += data + fsrc_bytes.extend(data) if expected_md5 and file_md5.hexdigest() != expected_md5: raise ValueError("md5checksum: {} does not match expected: " "{}".format(file_md5.hexdigest(), expected_md5)) - return fsrc_bytes + return bytes(fsrc_bytes) if data_home is None: fsrc = urlopen(req) From 3c5ab3e1a69cba565f8d70ade947771b1f70169f Mon Sep 17 00:00:00 2001 From: Shashank Singh Date: Fri, 11 Oct 2019 19:10:58 -0400 Subject: [PATCH 08/25] Update sklearn/datasets/openml.py return bytearray 
instead of new bytes Co-Authored-By: Thomas J Fan --- sklearn/datasets/openml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py index 2cf5b85fdd6b1..1820c759657a5 100644 --- a/sklearn/datasets/openml.py +++ b/sklearn/datasets/openml.py @@ -128,7 +128,7 @@ def _md5_validated_bytestream(fsrc, expected_md5=None, chunk_size=512): raise ValueError("md5checksum: {} does not match expected: " "{}".format(file_md5.hexdigest(), expected_md5)) - return bytes(fsrc_bytes) + return fsrc_bytes if data_home is None: fsrc = urlopen(req) From db159a5f6f587a924d844b5b0492ad64cb887f0f Mon Sep 17 00:00:00 2001 From: Shashank Singh Date: Mon, 2 Dec 2019 19:06:02 -0500 Subject: [PATCH 09/25] add early exiting --- sklearn/datasets/_openml.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/sklearn/datasets/_openml.py b/sklearn/datasets/_openml.py index c3e10f2c828cf..47ca1b604be88 100644 --- a/sklearn/datasets/_openml.py +++ b/sklearn/datasets/_openml.py @@ -114,17 +114,19 @@ def _md5_validated_bytestream(fsrc, expected_md5=None, chunk_size=512): ValueError : if expected_md5 does not match actual md5-checksum of stream """ + if expected_md5 is None: + return fsrc.read() + fsrc_bytes = bytearray() - file_md5 = hashlib.md5() if expected_md5_checksum else None + file_md5 = hashlib.md5() while True: data = fsrc.read(chunk_size) if not data: break - if expected_md5: - file_md5.update(data) + file_md5.update(data) fsrc_bytes.extend(data) - if expected_md5 and file_md5.hexdigest() != expected_md5: + if file_md5.hexdigest() != expected_md5: raise ValueError("md5checksum: {} does not match expected: " "{}".format(file_md5.hexdigest(), expected_md5)) From f745e2113d2eb0339b400056348d35c9fe3d9bb9 Mon Sep 17 00:00:00 2001 From: Shashank Singh Date: Fri, 3 Jan 2020 20:36:26 -0500 Subject: [PATCH 10/25] revert back to simple case --- sklearn/datasets/_openml.py | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/sklearn/datasets/_openml.py b/sklearn/datasets/_openml.py index 47ca1b604be88..88ebd504b2d64 100644 --- a/sklearn/datasets/_openml.py +++ b/sklearn/datasets/_openml.py @@ -114,23 +114,16 @@ def _md5_validated_bytestream(fsrc, expected_md5=None, chunk_size=512): ValueError : if expected_md5 does not match actual md5-checksum of stream """ + bytes = fsrc.read() + if expected_md5 is None: - return fsrc.read() - - fsrc_bytes = bytearray() - file_md5 = hashlib.md5() - while True: - data = fsrc.read(chunk_size) - if not data: - break - file_md5.update(data) - fsrc_bytes.extend(data) - - if file_md5.hexdigest() != expected_md5: + return bytes + + file_md5 = hashlib.md5(bytes).hexdigest() + if file_md5 != expected_md5: raise ValueError("md5checksum: {} does not match expected: " - "{}".format(file_md5.hexdigest(), - expected_md5)) - return fsrc_bytes + "{}".format(file_md5, expected_md5)) + return bytes if data_home is None: fsrc = urlopen(req) From 343303178773d98430d44ac0306f001714f15b0e Mon Sep 17 00:00:00 2001 From: Shashank Singh Date: Sat, 2 May 2020 23:43:39 -0400 Subject: [PATCH 11/25] Merge upstream, verify checksum while yielding stream --- sklearn/datasets/_openml.py | 343 +++++++++++++------------- sklearn/datasets/tests/test_openml.py | 111 ++++----- 2 files changed, 220 insertions(+), 234 deletions(-) diff --git a/sklearn/datasets/_openml.py b/sklearn/datasets/_openml.py index 88ebd504b2d64..c5a8da376403d 100644 --- a/sklearn/datasets/_openml.py +++ 
b/sklearn/datasets/_openml.py @@ -1,6 +1,7 @@ import gzip import json import os +import shutil import hashlib from os.path import join from warnings import warn @@ -9,6 +10,7 @@ import itertools from collections.abc import Generator from collections import OrderedDict +from functools import partial from urllib.request import urlopen, Request @@ -22,6 +24,7 @@ from ..utils import get_chunk_n_rows from ..utils import _chunk_generator from ..utils import check_pandas_support # noqa +from ..utils.validation import _deprecate_positional_args __all__ = ['fetch_openml'] @@ -44,11 +47,11 @@ def _retry_with_clean_cache(openml_path, data_home): """ def decorator(f): @wraps(f) - def wrapper(): + def wrapper(*args, **kw): if data_home is None: - return f() + return f(*args, **kw) try: - return f() + return f(*args, **kw) except HTTPError: raise except Exception: @@ -56,12 +59,12 @@ def wrapper(): local_path = _get_local_path(openml_path, data_home) if os.path.exists(local_path): os.unlink(local_path) - return f() + return f(*args, **kw) return wrapper return decorator -def _open_openml_url(openml_path, data_home, expected_md5_checksum=None): +def _open_openml_url(openml_path, data_home): """ Returns a resource from OpenML.org. Caches it to data_home if required. @@ -77,8 +80,8 @@ def _open_openml_url(openml_path, data_home, expected_md5_checksum=None): Returns ------- - result : bytes - Byte content of resource + result : stream + A stream to the OpenML resource """ def is_gzip(_fsrc): return _fsrc.info().get('Content-Encoding', '') == 'gzip' @@ -86,53 +89,11 @@ def is_gzip(_fsrc): req = Request(_OPENML_PREFIX + openml_path) req.add_header('Accept-encoding', 'gzip') - def _md5_validated_bytestream(fsrc, expected_md5=None, chunk_size=512): - """ - Takes in a byte-stream, reads in chunks and returns bytes. - If expected_md5 is not none, keeps md5 checksum state while streaming - and validates post stream consumption. 
- - Parameters - ---------- - fsrc : io.BufferedIOBase - input stream to read bytes from - - expected_md5 : str - expected md5-checksum value - - chunk_size : int - size of chunks to read at a time from stream - - Returns - ------- - fsrc_bytes : bytes - equivalent to fsrc_bytes.read() but with md5 validation if - expected_md5 is provided - - Raises - ------ - ValueError : - if expected_md5 does not match actual md5-checksum of stream - """ - bytes = fsrc.read() - - if expected_md5 is None: - return bytes - - file_md5 = hashlib.md5(bytes).hexdigest() - if file_md5 != expected_md5: - raise ValueError("md5checksum: {} does not match expected: " - "{}".format(file_md5, expected_md5)) - return bytes - if data_home is None: fsrc = urlopen(req) if is_gzip(fsrc): - fsrc = gzip.GzipFile(fileobj=fsrc, mode='rb') - return _md5_validated_bytestream( - fsrc, - expected_md5=expected_md5_checksum - ) + return gzip.GzipFile(fileobj=fsrc, mode='rb') + return fsrc local_path = _get_local_path(openml_path, data_home) if not os.path.exists(local_path): @@ -144,25 +105,20 @@ def _md5_validated_bytestream(fsrc, expected_md5=None, chunk_size=512): try: with closing(urlopen(req)) as fsrc: - if is_gzip(fsrc): # unzip it for checksum validation - fsrc = gzip.GzipFile(fileobj=fsrc, mode='rb') - bytes_content = _md5_validated_bytestream( - fsrc, - expected_md5=expected_md5_checksum - ) - with gzip.GzipFile(local_path, 'wb') as fdst: - fdst.write(bytes_content) + if is_gzip(fsrc): + with open(local_path, 'wb') as fdst: + shutil.copyfileobj(fsrc, fdst) + else: + with gzip.GzipFile(local_path, 'wb') as fdst: + shutil.copyfileobj(fsrc, fdst) except Exception: if os.path.exists(local_path): os.unlink(local_path) raise - else: - with gzip.GzipFile(local_path, "rb") as gzip_file: - bytes_content = gzip_file.read() # XXX: First time, decompression will not be necessary (by using fsrc), but # it will happen nonetheless - return bytes_content + return gzip.GzipFile(local_path, 'rb') def _get_json_content_from_openml_api(url, error_message, raise_if_error, @@ -199,7 +155,8 @@ def _get_json_content_from_openml_api(url, error_message, raise_if_error, @_retry_with_clean_cache(url, data_home) def _load_json(): - return json.loads(_open_openml_url(url, data_home).decode("utf-8")) + with closing(_open_openml_url(url, data_home)) as response: + return json.loads(response.read().decode("utf-8")) try: return _load_json() @@ -263,7 +220,7 @@ def _sparse_data_to_array(arff_data, include_columns): return y -def _convert_arff_data(arff_data, col_slice_x, col_slice_y, shape=None): +def _convert_arff_data(arff, col_slice_x, col_slice_y, shape=None): """ converts the arff object into the appropriate matrix type (np.array or scipy.sparse.csr_matrix) based on the 'data part' (i.e., in the @@ -271,8 +228,8 @@ def _convert_arff_data(arff_data, col_slice_x, col_slice_y, shape=None): Parameters ---------- - arff_data : list or dict - as obtained from liac-arff object + arff : dict + As obtained from liac-arff object. 
col_slice_x : list The column indices that are sliced from the original array to return @@ -287,6 +244,7 @@ def _convert_arff_data(arff_data, col_slice_x, col_slice_y, shape=None): X : np.array or scipy.sparse.csr_matrix y : np.array """ + arff_data = arff['data'] if isinstance(arff_data, Generator): if shape[0] == -1: count = -1 @@ -346,7 +304,8 @@ def _convert_arff_data_dataframe(arff, columns, features_dict): Returns ------- - dataframe : pandas DataFrame + result : tuple + tuple with the resulting dataframe """ pd = check_pandas_support('fetch_openml with as_frame=True') @@ -373,7 +332,7 @@ def _convert_arff_data_dataframe(arff, columns, features_dict): if dtype == 'category': dtype = pd.api.types.CategoricalDtype(attributes[column]) df[column] = df[column].astype(dtype, copy=False) - return df + return (df, ) def _get_data_info_by_name(name, version, data_home): @@ -494,29 +453,131 @@ def _get_num_samples(data_qualities): return int(float(qualities.get('NumberOfInstances', default_n_samples))) -def _download_data_arff(file_id, sparse, data_home, encode_nominal=True, - expected_md5_checksum=None): - # Accesses an ARFF file on the OpenML server. Documentation: - # https://www.openml.org/api_data_docs#!/data/get_download_id - # encode_nominal argument is to ensure unit testing, do not alter in - # production! - url = _DATA_FILE.format(file_id) +def _load_arff_response(url, data_home, return_type, encode_nominal, + parse_arff, md5_checksum): + """Load arff data with url and parses arff response with parse_arff""" + response = _open_openml_url(url, data_home) + + with closing(response): + # Note that if the data is dense, no reading is done until the data + # generator is iterated. + actual_md5_checksum = hashlib.md5() + + def _stream_checksum_generator(response): + for line in response: + actual_md5_checksum.update(line) + yield line.decode('utf-8') + # stream consumed, check md5 + if actual_md5_checksum.hexdigest() != md5_checksum: + raise ValueError("md5 checksum of local file for " + url + + " does not match description. " + "Downloaded file could have been modified / corrupted," + "clean cache and retry...") + + arff = _arff.load(_stream_checksum_generator(response), + return_type=return_type, + encode_nominal=encode_nominal) + return parse_arff(arff) + + +def _download_data_to_bunch(url, sparse, data_home, *, as_frame, features_list, data_columns, target_columns, shape, + md5_checksum): + """Download OpenML ARFF and convert to Bunch of data""" + # NB: this function is long in order to handle retry for any failure + # during the streaming parse of the ARFF. + + # Prepare which columns and data types should be returned for the X and y + features_dict = {feature['name']: feature for feature in features_list} - @_retry_with_clean_cache(url, data_home) - def _arff_load(): - bytes_content = _open_openml_url(url, data_home, - expected_md5_checksum) - if sparse is True: - return_type = _arff.COO - else: - return_type = _arff.DENSE_GEN + # XXX: col_slice_y should be all nominal or all numeric + _verify_target_data_type(features_dict, target_columns) + + col_slice_y = [int(features_dict[col_name]['index']) + for col_name in target_columns] + + col_slice_x = [int(features_dict[col_name]['index']) + for col_name in data_columns] + for col_idx in col_slice_y: + feat = features_list[col_idx] + nr_missing = int(feat['number_of_missing_values']) + if nr_missing > 0: + raise ValueError('Target column {} has {} missing values. ' + 'Missing values are not supported for target ' + 'columns. 
'.format(feat['name'], nr_missing)) - arff_file = _arff.loads(bytes_content.decode('utf-8'), - encode_nominal=encode_nominal, - return_type=return_type) - return arff_file + # Access an ARFF file on the OpenML server. Documentation: + # https://www.openml.org/api_data_docs#!/data/get_download_id - return _arff_load() + if sparse is True: + return_type = _arff.COO + else: + return_type = _arff.DENSE_GEN + + frame = nominal_attributes = None + if as_frame: + columns = data_columns + target_columns + parse_arff = partial(_convert_arff_data_dataframe, columns=columns, + features_dict=features_dict) + + def postprocess(frame): # type:ignore + X = frame[data_columns] + if len(target_columns) >= 2: + y = frame[target_columns] + elif len(target_columns) == 1: + y = frame[target_columns[0]] + else: + y = None + return X, y, frame, nominal_attributes + else: + def parse_arff(arff): + X, y = _convert_arff_data(arff, col_slice_x, col_slice_y, shape) + # nominal attributes is a dict mapping from the attribute name to + # the possible values. Includes also the target column (which will + # be popped off below, before it will be packed in the Bunch + # object) + nominal_attributes = {k: v for k, v in arff['attributes'] + if isinstance(v, list) and + k in data_columns + target_columns} + return X, y, nominal_attributes + + def postprocess(X, y, nominal_attributes): # type:ignore + is_classification = {col_name in nominal_attributes + for col_name in target_columns} + if not is_classification: + # No target + pass + elif all(is_classification): + y = np.hstack([ + np.take( + np.asarray(nominal_attributes.pop(col_name), + dtype='O'), + y[:, i:i + 1].astype(int, copy=False)) + for i, col_name in enumerate(target_columns) + ]) + elif any(is_classification): + raise ValueError('Mix of nominal and non-nominal targets is ' + 'not currently supported') + + # reshape y back to 1-D array, if there is only 1 target column; + # back to None if there are not target columns + if y.shape[1] == 1: + y = y.reshape((-1,)) + elif y.shape[1] == 0: + y = None + return X, y, frame, nominal_attributes + + out = _retry_with_clean_cache(url, data_home)( + _load_arff_response)(url, data_home, + return_type=return_type, + encode_nominal=not as_frame, + parse_arff=parse_arff, + md5_checksum=md5_checksum) + X, y, frame, nominal_attributes = postprocess(*out) + + return Bunch(data=X, target=y, frame=frame, + categories=nominal_attributes, + feature_names=data_columns, + target_names=target_columns) def _verify_target_data_type(features_dict, target_columns): @@ -561,9 +622,10 @@ def _valid_data_column_names(features_list, target_columns): return valid_data_column_names -def fetch_openml(name=None, version='active', data_id=None, data_home=None, +@_deprecate_positional_args +def fetch_openml(name=None, *, version='active', data_id=None, data_home=None, target_column='default-target', cache=True, return_X_y=False, - as_frame=False, verify_checksum=True): + as_frame=False): """Fetch dataset from openml by name or dataset id. Datasets are uniquely identified by either an integer ID or by a @@ -574,6 +636,8 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None, Read more in the :ref:`User Guide `. + .. versionadded:: 0.20 + .. note:: EXPERIMENTAL The API is experimental (particularly the return value structure), @@ -624,16 +688,11 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None, data. 
If ``return_X_y`` is True, then ``(data, target)`` will be pandas DataFrames or Series as describe above. - verify_checksum : boolean, default=True - If True, verifies md5_checksum of file provided in /download/{id} - If cache=True, verification only happens during data download - from network. - Returns ------- - data : Bunch - Dictionary-like object, with attributes: + data : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. data : np.array, scipy.sparse.csr_matrix of floats, or pandas DataFrame The feature matrix. Categorical features are encoded as ordinals. @@ -756,25 +815,6 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None, data_columns = _valid_data_column_names(features_list, target_columns) - # prepare which columns and data types should be returned for the X and y - features_dict = {feature['name']: feature for feature in features_list} - - # XXX: col_slice_y should be all nominal or all numeric - _verify_target_data_type(features_dict, target_columns) - - col_slice_y = [int(features_dict[col_name]['index']) - for col_name in target_columns] - - col_slice_x = [int(features_dict[col_name]['index']) - for col_name in data_columns] - for col_idx in col_slice_y: - feat = features_list[col_idx] - nr_missing = int(feat['number_of_missing_values']) - if nr_missing > 0: - raise ValueError('Target column {} has {} missing values. ' - 'Missing values are not supported for target ' - 'columns. '.format(feat['name'], nr_missing)) - # determine arff encoding to return if not return_sparse: # The shape must include the ignored features to keep the right indexes @@ -785,69 +825,22 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None, shape = None # obtain the data - arff = _download_data_arff(data_description['file_id'], - return_sparse, - data_home, - encode_nominal=not as_frame, - expected_md5_checksum=data_description["md5_checksum"] if verify_checksum else None) # noqa: E501 + url = _DATA_FILE.format(data_description['file_id']) + bunch = _download_data_to_bunch(url, return_sparse, data_home, + as_frame=as_frame, + features_list=features_list, shape=shape, + target_columns=target_columns, + data_columns=data_columns, + md5_checksum=data_description["md5_checksum"]) + + if return_X_y: + return bunch.data, bunch.target description = "{}\n\nDownloaded from openml.org.".format( data_description.pop('description')) - nominal_attributes = None - frame = None - if as_frame: - columns = data_columns + target_columns - frame = _convert_arff_data_dataframe(arff, columns, features_dict) - X = frame[data_columns] - if len(target_columns) >= 2: - y = frame[target_columns] - elif len(target_columns) == 1: - y = frame[target_columns[0]] - else: - y = None - else: - # nominal attributes is a dict mapping from the attribute name to the - # possible values. 
Includes also the target column (which will be - # popped off below, before it will be packed in the Bunch object) - nominal_attributes = {k: v for k, v in arff['attributes'] - if isinstance(v, list) and - k in data_columns + target_columns} - - X, y = _convert_arff_data(arff['data'], col_slice_x, - col_slice_y, shape) - - is_classification = {col_name in nominal_attributes - for col_name in target_columns} - if not is_classification: - # No target - pass - elif all(is_classification): - y = np.hstack([ - np.take( - np.asarray(nominal_attributes.pop(col_name), dtype='O'), - y[:, i:i + 1].astype(int, copy=False)) - for i, col_name in enumerate(target_columns) - ]) - elif any(is_classification): - raise ValueError('Mix of nominal and non-nominal targets is not ' - 'currently supported') - - # reshape y back to 1-D array, if there is only 1 target column; back - # to None if there are not target columns - if y.shape[1] == 1: - y = y.reshape((-1,)) - elif y.shape[1] == 0: - y = None - - if return_X_y: - return X, y - - bunch = Bunch( - data=X, target=y, frame=frame, feature_names=data_columns, - target_names=target_columns, + bunch.update( DESCR=description, details=data_description, - categories=nominal_attributes, url="https://www.openml.org/d/{}".format(data_id)) return bunch diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py index c713e4087ed42..8187dc3feb9db 100644 --- a/sklearn/datasets/tests/test_openml.py +++ b/sklearn/datasets/tests/test_openml.py @@ -13,8 +13,9 @@ from sklearn import config_context from sklearn.datasets import fetch_openml from sklearn.datasets._openml import (_open_openml_url, + _arff, + _DATA_FILE, _get_data_description_by_id, - _download_data_arff, _get_local_path, _retry_with_clean_cache, _feature_to_dtype) @@ -49,8 +50,7 @@ def decode_column(data_bunch, col_idx): # non-nominal attribute return data_bunch.data[:, col_idx] - data_bunch = fetch_openml(data_id=data_id, cache=False, target_column=None, - verify_checksum=False) + data_bunch = fetch_openml(data_id=data_id, cache=False, target_column=None) # also obtain decoded arff data_description = _get_data_description_by_id(data_id, None) @@ -58,8 +58,13 @@ def decode_column(data_bunch, col_idx): if sparse is True: raise ValueError('This test is not intended for sparse data, to keep ' 'code relatively simple') - data_arff = _download_data_arff(data_description['file_id'], - sparse, None, False) + url = _DATA_FILE.format(data_description['file_id']) + with _open_openml_url(url, data_home=None) as f: + data_arff = _arff.load((line.decode('utf-8') for line in f), + return_type=(_arff.COO if sparse + else _arff.DENSE_GEN), + encode_nominal=False) + data_downloaded = np.array(list(data_arff['data']), dtype='O') for i in range(len(data_bunch.feature_names)): @@ -81,19 +86,18 @@ def _fetch_dataset_from_openml(data_id, data_name, data_version, # result. 
Note that this function can be mocked (by invoking # _monkey_patch_webbased_functions before invoking this function) data_by_name_id = fetch_openml(name=data_name, version=data_version, - cache=False, verify_checksum=False) + cache=False) assert int(data_by_name_id.details['id']) == data_id # Please note that cache=False is crucial, as the monkey patched files are # not consistent with reality - fetch_openml(name=data_name, cache=False, verify_checksum=False) + fetch_openml(name=data_name, cache=False) # without specifying the version, there is no guarantee that the data id # will be the same # fetch with dataset id data_by_id = fetch_openml(data_id=data_id, cache=False, - target_column=target_column, - verify_checksum=False) + target_column=target_column) assert data_by_id.details['name'] == data_name assert data_by_id.data.shape == (expected_observations, expected_features) if isinstance(target_column, str): @@ -105,7 +109,7 @@ def _fetch_dataset_from_openml(data_id, data_name, data_version, assert data_by_id.target.shape == (expected_observations, len(target_column)) assert data_by_id.target_names == target_column - assert data_by_id.data.dtype == np.float64 + assert data_by_id.data.dtype == expected_data_dtype assert data_by_id.target.dtype == expected_target_dtype assert len(data_by_id.feature_names) == expected_features for feature in data_by_id.feature_names: @@ -120,13 +124,8 @@ def _fetch_dataset_from_openml(data_id, data_name, data_version, if compare_default_target: # check whether the data by id and data by id target are equal - data_by_id_default = fetch_openml(data_id=data_id, cache=False, - verify_checksum=False) - if data_by_id.data.dtype == np.float64: - np.testing.assert_allclose(data_by_id.data, - data_by_id_default.data) - else: - assert np.array_equal(data_by_id.data, data_by_id_default.data) + data_by_id_default = fetch_openml(data_id=data_id, cache=False) + np.testing.assert_allclose(data_by_id.data, data_by_id_default.data) if data_by_id.target.dtype == np.float64: np.testing.assert_allclose(data_by_id.target, data_by_id_default.target) @@ -143,7 +142,7 @@ def _fetch_dataset_from_openml(data_id, data_name, data_version, # test return_X_y option fetch_func = partial(fetch_openml, data_id=data_id, cache=False, - target_column=target_column, verify_checksum=False) + target_column=target_column) check_return_X_y(data_by_id, fetch_func) return data_by_id @@ -184,6 +183,15 @@ def info(self): return {'Content-Encoding': 'gzip'} return {} + def __iter__(self): + return iter(self.data) + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + return False + def _file_name(url, suffix): return (re.sub(r'\W', '-', url[len("https://openml.org/"):]) + suffix + path_suffix) @@ -407,8 +415,7 @@ def test_fetch_openml_anneal_pandas(monkeypatch): _monkey_patch_webbased_functions(monkeypatch, data_id, True) bunch = fetch_openml(data_id=data_id, as_frame=True, - target_column=target_column, cache=False, - verify_checksum=False) + target_column=target_column, cache=False) data = bunch.data target = bunch.target frame = bunch.frame @@ -492,8 +499,7 @@ def test_convert_arff_data_dataframe_warning_low_memory_pandas(monkeypatch): msg = 'Could not adhere to working_memory config.' 
with pytest.warns(UserWarning, match=msg): with config_context(working_memory=1e-6): - fetch_openml(data_id=data_id, as_frame=True, cache=False, - verify_checksum=False) + fetch_openml(data_id=data_id, as_frame=True, cache=False) def test_fetch_openml_adultcensus_pandas_return_X_y(monkeypatch): @@ -510,7 +516,7 @@ def test_fetch_openml_adultcensus_pandas_return_X_y(monkeypatch): _monkey_patch_webbased_functions(monkeypatch, data_id, True) X, y = fetch_openml(data_id=data_id, as_frame=True, cache=False, - return_X_y=True, verify_checksum=False) + return_X_y=True) assert isinstance(X, pd.DataFrame) assert X.shape == data_shape n_categories = len([dtype for dtype in X.dtypes @@ -539,8 +545,7 @@ def test_fetch_openml_adultcensus_pandas(monkeypatch): target_column = 'class' _monkey_patch_webbased_functions(monkeypatch, data_id, True) - bunch = fetch_openml(data_id=data_id, as_frame=True, cache=False, - verify_checksum=False) + bunch = fetch_openml(data_id=data_id, as_frame=True, cache=False) data = bunch.data target = bunch.target frame = bunch.frame @@ -578,8 +583,7 @@ def test_fetch_openml_miceprotein_pandas(monkeypatch): frame_n_floats = 77 _monkey_patch_webbased_functions(monkeypatch, data_id, True) - bunch = fetch_openml(data_id=data_id, as_frame=True, cache=False, - verify_checksum=False) + bunch = fetch_openml(data_id=data_id, as_frame=True, cache=False) data = bunch.data target = bunch.target frame = bunch.frame @@ -619,7 +623,7 @@ def test_fetch_openml_emotions_pandas(monkeypatch): _monkey_patch_webbased_functions(monkeypatch, data_id, True) bunch = fetch_openml(data_id=data_id, as_frame=True, cache=False, - target_column=target_column, verify_checksum=False) + target_column=target_column) data = bunch.data target = bunch.target frame = bunch.frame @@ -748,7 +752,7 @@ def test_fetch_openml_iris_multitarget(monkeypatch, gzip_response): _fetch_dataset_from_openml(data_id, data_name, data_version, target_column, expected_observations, expected_features, expected_missing, - object, np.float64, expect_sparse=False, + np.float64, np.float64, expect_sparse=False, compare_default_target=False) @@ -767,7 +771,7 @@ def test_fetch_openml_anneal(monkeypatch, gzip_response): _fetch_dataset_from_openml(data_id, data_name, data_version, target_column, expected_observations, expected_features, expected_missing, - object, object, expect_sparse=False, + np.float64, object, expect_sparse=False, compare_default_target=True) @@ -792,7 +796,7 @@ def test_fetch_openml_anneal_multitarget(monkeypatch, gzip_response): _fetch_dataset_from_openml(data_id, data_name, data_version, target_column, expected_observations, expected_features, expected_missing, - object, object, expect_sparse=False, + np.float64, object, expect_sparse=False, compare_default_target=False) @@ -810,7 +814,7 @@ def test_fetch_openml_cpu(monkeypatch, gzip_response): _fetch_dataset_from_openml(data_id, data_name, data_version, target_column, expected_observations, expected_features, expected_missing, - object, np.float64, expect_sparse=False, + np.float64, np.float64, expect_sparse=False, compare_default_target=True) @@ -934,7 +938,7 @@ def test_open_openml_url_cache(monkeypatch, gzip_response, tmpdir): assert os.path.isfile(location) # redownload, to utilize cache response2 = _open_openml_url(openml_path, cache_directory) - assert response1 == response2 + assert response1.read() == response2.read() @pytest.mark.parametrize('gzip_response', [True, False]) @@ -1011,8 +1015,7 @@ def _mock_urlopen_raise(request): monkeypatch, data_id, 
gzip_response) X_fetched, y_fetched = fetch_openml(data_id=data_id, cache=True, data_home=cache_directory, - return_X_y=True, - verify_checksum=False) + return_X_y=True) monkeypatch.setattr(sklearn.datasets._openml, 'urlopen', _mock_urlopen_raise) @@ -1045,13 +1048,12 @@ def test_fetch_openml_inactive(monkeypatch, gzip_response): _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response) glas2 = assert_warns_message( UserWarning, "Version 1 of dataset glass2 is inactive,", fetch_openml, - data_id=data_id, cache=False, verify_checksum=False) + data_id=data_id, cache=False) # fetch inactive dataset by name and version assert glas2.data.shape == (163, 9) glas2_by_version = assert_warns_message( UserWarning, "Version 1 of dataset glass2 is inactive,", fetch_openml, - data_id=None, name="glass2", version=1, cache=False, - verify_checksum=False) + data_id=None, name="glass2", version=1, cache=False) assert int(glas2_by_version.details['id']) == data_id @@ -1062,8 +1064,7 @@ def test_fetch_nonexiting(monkeypatch, gzip_response): _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response) # Note that we only want to search by name (not data id) assert_raise_message(ValueError, "No active dataset glass2 found", - fetch_openml, name='glass2', cache=False, - verify_checksum=False) + fetch_openml, name='glass2', cache=False) @pytest.mark.parametrize('gzip_response', [True, False]) @@ -1075,8 +1076,7 @@ def test_raises_illegal_multitarget(monkeypatch, gzip_response): assert_raise_message(ValueError, "Can only handle homogeneous multi-target datasets,", fetch_openml, data_id=data_id, - target_column=targets, cache=False, - verify_checksum=False) + target_column=targets, cache=False) @pytest.mark.parametrize('gzip_response', [True, False]) @@ -1089,20 +1089,20 @@ def test_warn_ignore_attribute(monkeypatch, gzip_response): assert_warns_message(UserWarning, expected_row_id_msg.format('MouseID'), fetch_openml, data_id=data_id, target_column='MouseID', - cache=False, verify_checksum=False) + cache=False) assert_warns_message(UserWarning, expected_ignore_msg.format('Genotype'), fetch_openml, data_id=data_id, target_column='Genotype', - cache=False, verify_checksum=False) + cache=False) # multi column test assert_warns_message(UserWarning, expected_row_id_msg.format('MouseID'), fetch_openml, data_id=data_id, target_column=['MouseID', 'class'], - cache=False, verify_checksum=False) + cache=False) assert_warns_message(UserWarning, expected_ignore_msg.format('Genotype'), fetch_openml, data_id=data_id, target_column=['Genotype', 'class'], - cache=False, verify_checksum=False) + cache=False) @pytest.mark.parametrize('gzip_response', [True, False]) @@ -1113,8 +1113,7 @@ def test_string_attribute_without_dataframe(monkeypatch, gzip_response): assert_raise_message(ValueError, ('STRING attributes are not supported for ' 'array representation. Try as_frame=True'), - fetch_openml, data_id=data_id, cache=False, - verify_checksum=False) + fetch_openml, data_id=data_id, cache=False) @pytest.mark.parametrize('gzip_response', [True, False]) @@ -1125,7 +1124,7 @@ def test_dataset_with_openml_error(monkeypatch, gzip_response): UserWarning, "OpenML registered a problem with the dataset. It might be unusable. 
" "Error:", - fetch_openml, data_id=data_id, cache=False, verify_checksum=False + fetch_openml, data_id=data_id, cache=False ) @@ -1212,18 +1211,12 @@ def test_fetch_openml_verify_checksum(monkeypatch): data[len(data)-1] = 37 modified_gzip.write(data) - # succeeds if checksum validation is explicitly overriden - dataset = sklearn.datasets.fetch_openml(data_id=data_id, cache=False, - verify_checksum=False) - assert dataset - - # fails with checksum validation (default) + # should fail checksum validation with pytest.raises(ValueError) as exc: sklearn.datasets.fetch_openml(data_id=data_id, cache=False, - as_frame=True, - verify_checksum=True) - - assert exc + as_frame=True) + # exception message should have file-path + assert "1666876" in repr(exc) finally: shutil.copy(backup_data_path, original_data_path) os.remove(backup_data_path) From c7774ef76df968d2deb226aed2a7a66dd41d9287 Mon Sep 17 00:00:00 2001 From: Shashank Singh Date: Sun, 3 May 2020 01:16:36 -0400 Subject: [PATCH 12/25] Update checksum for truncated local test files --- .../data/openml/1/api-v1-json-data-1.json.gz | Bin 1785 -> 1786 bytes .../openml/292/api-v1-json-data-292.json.gz | Bin 547 -> 551 bytes .../openml/292/api-v1-json-data-40981.json.gz | Bin 547 -> 553 bytes 3 files changed, 0 insertions(+), 0 deletions(-) diff --git a/sklearn/datasets/tests/data/openml/1/api-v1-json-data-1.json.gz b/sklearn/datasets/tests/data/openml/1/api-v1-json-data-1.json.gz index f75912bf2def733f38da398a5155a89bc110dc51..ba544db4916374d870531fbfcdb4e88718569d30 100644 GIT binary patch delta 1780 zcmVYIr`fm{qF}9X<3S$hokW%67UfOK7s`C&&iuXL?vBSOs*nUg@Tuo z7x`cg|D3#eGvE=v4eS9?A34pK`lNZzXsTr!CMt~|I9k%TGB`iKtYuP&^Yb}>L`6;| ziy(N0cXY!eSV$&Pp09!C5quN$8LsMF@|*!I^YimXQHzkNk-ry$uX#>WID5FhdHcs% zB4s&Wj+UeCb~`LemS^d(5bNbAT8^BzV1c%9n=)E42)S&`DB&`{Pk06RqfiP~iXvoH zgx48Eg~2o`_894M22%fSyi?C+K%`U5D=kTN*i4YJ=3;WAAj<5i^s+R(IS zIE7#!1#6&e z7ZTP*5uy7ulssfat9DMQSEe)@HSZ7}aBS-*@Vt_YMv4e~2=`p3OojY^4hIJ~d=8gQ zoGJ(oJDsc)dFtCEFM$G$~&Qx;3*%3V=a@Jr2eL}R#M&!#?51!US5J`*JV zGYu1!Yd{5LOfM}Nl^{~ZcZae=mtqIAWT^Jq}S>{ERJCjCMnM{CR%FBYd5s1FGh>RSG9;~ z$gG&>yi$g4krvD7oxT3Anrde(@{YVGABdsYMv9p_sMC(S2hQ3^ZV+}?0iWO6d63m0 zepQQf2J%%c(iv=*ql|SCSDhe`%^-bsII@_%Q0}!3(ibLwqUa&6TP@O8CPmd7e%or1 zyb!Y~>ZsB}MuW*xmCHqj3#I#oFHa}(*N&K@GMVuYex=3R6D#flM@pY}rz=W2)#l8OT z&CdiDpWeTJH(Sa@GVufqpSq}*Tfs$6w>;`>mSZ5KHf4F=p92swT<{kuWs%=@+saiO z7d@%~#`uu&h5F;EKfX6MXr#8J;Zw6e&BoChEXa6I*i+Z;De|BI?Wuc@mxoj9w0i}> zVvPGWeTGx4-78v_DGw=@ixwm(7AJwM(Y!4xr{*qyvC^1RI8#Nn!g_>dE$H1VQntoB zMa1#u#pJ%o9U50)tZ-XZ?bIcQ1{K44q)HEM$Ln!#O#wqbKATI;zG{4JEYD%V*Ex?l zdeM}}<>1HVKu_Odk*|GyOh z{f^Lo@og=fc7;=u1hWH+_PFPZ#@XF4{Fy;^195C$3wxnVwL|u#H{2k*(Rfp1vm1}^ zdRo|x#~aVIup5u(Ia=6_$DipNhtrG5aOCtO(i}P6hhgQ+;<-b1gICXVr*K}#TZP+`d*fCy>5tg$wEp&M zhU^p%eT6$|kF4HfS=Mm{u9j{#_1#tVnU{ATK4@88rUi{w*bOz^sx!7H));@@z2nY* z?b+NJYaE`z>g-dPTfDpT&>4AG$ppQW3|-H=eRp zuYrzOOzTvxBo%Ayx9RSPRFI~+^m^)5(-?L-Yf0Dm_4}Zbb@aSKn_yliop)co)*&DK zp>6Ss>x-Z=RCeOrb=0e-|L;WoYFn*i$6+GLx>Ddylc`mvmsT~a!~WE=$JW@z6J$Q| z=B}%;>cS)x;trcpkUYa0Qq)o_v*=>gbW&BFDa>^2*xrO)uppdyEDmRsO`HqI4r2T_ Wnc0JrH~*ged-6a2I`gW|5C8zcvUb}5 delta 1779 zcmVMTf!{gno>9pn z2p-`zUGoU$l8J<8E1+2fUj%)Giz<^mV*q}3dOFW55i&LMuUzmI&u9WCcb8W$|2~PO zEN1?|A8a<8eqOLFP5QZ5`Gd$GI4{8hZQ(j$v}6!+S({NpKf8^23HYN>2v&$ZWMzq7 znNT2Kf{Yowq7VzV{K@YrlYZ}iXC3w{e?Wf*#TiltC)+{x%RZbZ38cI%RX}T+R1BvO z45VOn)Ktxw*JVfW#d%Y>ru!%lm&8w6E>BeYmEjM3?9ar>e4tEZ<~WH|oO;OW2xbvNH>+ z@Ubj-^Q}l|W~RP^vMLJ}N*((K*-h9|DpzjuvV>0+&k&8_f<2vhKoyW_TlkES_zyIU zRj&QcsK}JlUvlwK#(B|y2e>+pAf{_1J%=!6;e%eQU)gdA^Ds_$nljN?Lr|1g0Wztv 
z$|V!=Ssy-DX~2Xc`3kwG8RpvzCew)qjN5>7DFqKIsSxg@iF^!Uu4t_Cs!SfOI(ec- z>tc|St(bVKaHiXQ+iCFU^>_0eU>YzjmNezb;|!X|vdFs&^6oc(@@`IUKa%@%a{rdx zzbE$}$o<@m!z97o6jhhI~EKG*^Q})GTgzlXF~Lgb0c&Y7C^)@YFp91(GyR$HIu=+mDD zG3i6iQNR=2kTZjSu!%!lG9qUNHC3?*maSy`tSk1gk1LW{nEcueZR(@ZBJpJnL4PGmb-h-+DL8?wpRh4ugyHjYOs7( zi*yF^SuN5TZ2UpW+K7u*5XfebJUbj&OrI!sS_kP1W07}%5SNV>=_}&A>j3SBuR3cxdhqNw|SUyX!t4Mrat1>XAMQ3p@|G4^{ z!2JE2H)czJxkx6SfZ;iz^$mxbht<7=>gw!T1>-w_?LWXnxB&96!b-S%x#c|%D z3Sfv28J?-XvHJ7ILxVqH=8R5(|xgIffHeR7Nyi-IR zZ(dC9irl_&1;z@uQPoaea%fO8tVgQ!&}O_I_tF$F{>Koq{z~-XFWvS;g)yHlxlF8?Nk=qh^MmE}qo$ZUpEz#IB=b_gU zI-ae6g_E{$Vv=CGXVD&Ze9<_&9fmhG$aWx(?P*~rl!uAYLcCHMBbSZ9*W2v9unGzg!{?ny8EJk{akbRbIt8UGBQGMAJ>UvqL<5knrZmM zbNw|h^}aHl%B4P(wp+gbgpX90-$NJAEwUZFdL~U%aY!svJi0w}6ZohiS zR&m!=xTEIC>O7WZ9ai9K>1I>kT~(iXe)IONmQ_WP(`bR+P~ELMLwjTm@i)G8+?hRp zojF5|!!uZ&eF}4pcXu9|;UZBY)nJx~J51FT4c0ckVeg{m@(1{#+rvH;%cb#+Cv4Ga zpd+@VRU#LXiWTP6lEx1xTwt=8jWFp*?cDsZdG)GE_UtD4nfe`47~Yv|$$G8>I& zuB)-?!Xy;p2AffkJjEK4S5hm}=xkAUQe~AY%*YL#DRo&eoU#etf2VYE=A4CYIB}7q VaMC+^@#E;n(SJsQd~41S002yne&qlF diff --git a/sklearn/datasets/tests/data/openml/292/api-v1-json-data-292.json.gz b/sklearn/datasets/tests/data/openml/292/api-v1-json-data-292.json.gz index 3313b6fb4a0a26fd6c4561e80f51b93b4b7390b3..17caf8a8dcc7b1b33bca8f5dfd6d90b66a6b2e3a 100644 GIT binary patch delta 39 ucmZ3?vYbUuzMF%?(0^SV19M_Qrf!*`ZdP%Ao^DEFNusWirO`$yP9^~0L<<%G delta 35 qcmZ3^vY16mzMF%CnJ+n(fjK3yBr!fEwK%ybv!En1KX0QPCldgznhJ6N diff --git a/sklearn/datasets/tests/data/openml/292/api-v1-json-data-40981.json.gz b/sklearn/datasets/tests/data/openml/292/api-v1-json-data-40981.json.gz index 888140f92b36025bdaf00eba9eeb917fd9c3964e..9fd04cf4a73cb3799757bb1c677df329708ed35b 100644 GIT binary patch literal 553 zcmV+^0@nQ>iwFoeQm$SA17UD!Ep{<2YIARHEo5PIVJ$Q;IXE#cYIARH098`kirX*{ zeNMhYxNl(-+gdw~{g#)7LJLc?T_|-iV_A;H%97C~Azk+0D_LosfH7xgj^=V^-j|E0 z1p~FFrf#Vwij9Uz5tZnDxmZN3btu9Gjvl-zV4&Wyx7sM+3`FpssM6t8lm^dzu4uK| zThmLmT9xQvz;Jo2;;*bZynaKd606lgT1BW2{}69No&{PXoTS-0P1k7_IM1Adrid^z z5xuh#FFn)9eWs_VgF#x2=I)T6&5ibe6g7&`6K!v^c1jM@LmoQOF7u`mUko~fMnbB= zW%)zfjV}~B{NFyR;+sO1_=IGK2nVgHE)l*ueEkw_i{xr|6pU4#k!C@t&Q~#Bqr(VF z(|Z5?=_!PWa%`!l*u7@gxa6D?FLCYaOpBR^3luwY3GID2*{#$3CEb=uQsQ0UaYm>h z6X)4B&bmHV+`D>XMqO5M6`#-NjU1^M_(m$%j-RiJPpLz0XTDYyXT=WVDxSsJ=r9?m zuDcG+IIY8NP3(U+(}CrtHlU8~9KpDiH8ynqGf)g1o%JgkE~al>0Va;MyEqhSW82Fe z%x8>~89!z-&TjZQp7H1CcQ|XNS;IIp7vI8oBaC+Yu-bcD3T_1_CvY1=Lz~y6r{qoB rVbYuc-lagGDDc+(A2HcM-gI44Z zGD9^VEmtzB!25EsNOV&heyg9yn0|3i3Ju0Uu3*jH~RjP{wD}ZuXtRP&$B66MtBZdLW?L-VtET4OB zfPc1Upd*8F2F&U}=*_qG05r3jvomXNvvyhy(?=1N7@v8K|nF^q*Bh;_JcAsDEjuGp0V89{{stZ+Yn9*@0+OYcY{qZq| zsP=57r`Y|>u5l#YJYJcG1j$jcd(o>%4fVO zW?bCxOElxp$?tg9-0+4AZr_50@n$f_@57nkZ5i<^IQs#&F*LMgO?yUPjT-`?fn~Wb lXonao_C?u1RKVSKvukKcQMrN1a`CbJ_zTS2bE$~~005Kz4Z;8b From d00a4137110cef2167610767d97125340b1300c6 Mon Sep 17 00:00:00 2001 From: Shashank Singh Date: Sun, 3 May 2020 01:32:07 -0400 Subject: [PATCH 13/25] Linting updates --- sklearn/datasets/_openml.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/sklearn/datasets/_openml.py b/sklearn/datasets/_openml.py index c5a8da376403d..b92f1470f4690 100644 --- a/sklearn/datasets/_openml.py +++ b/sklearn/datasets/_openml.py @@ -471,8 +471,8 @@ def _stream_checksum_generator(response): if actual_md5_checksum.hexdigest() != md5_checksum: raise ValueError("md5 checksum of local file for " + url + " does not match description. 
" - "Downloaded file could have been modified / corrupted," - "clean cache and retry...") + "Downloaded file could have been modified / " + "corrupted, clean cache and retry...") arff = _arff.load(_stream_checksum_generator(response), return_type=return_type, @@ -480,8 +480,8 @@ def _stream_checksum_generator(response): return parse_arff(arff) -def _download_data_to_bunch(url, sparse, data_home, *, as_frame, features_list, data_columns, target_columns, shape, - md5_checksum): +def _download_data_to_bunch(url, sparse, data_home, *, as_frame, features_list, + data_columns, target_columns, shape, md5_checksum): """Download OpenML ARFF and convert to Bunch of data""" # NB: this function is long in order to handle retry for any failure # during the streaming parse of the ARFF. @@ -831,7 +831,8 @@ def fetch_openml(name=None, *, version='active', data_id=None, data_home=None, features_list=features_list, shape=shape, target_columns=target_columns, data_columns=data_columns, - md5_checksum=data_description["md5_checksum"]) + md5_checksum=data_description[ + "md5_checksum"]) if return_X_y: return bunch.data, bunch.target From 2e744aafdc7c7185c7af7b05a260b0bed0761867 Mon Sep 17 00:00:00 2001 From: Shashank Singh Date: Sun, 3 May 2020 21:23:11 -0400 Subject: [PATCH 14/25] Fully consume generator, test for non frame case --- sklearn/datasets/_openml.py | 6 +----- sklearn/datasets/tests/test_openml.py | 16 ++++++++++++---- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/sklearn/datasets/_openml.py b/sklearn/datasets/_openml.py index b92f1470f4690..fcf3e0a66759e 100644 --- a/sklearn/datasets/_openml.py +++ b/sklearn/datasets/_openml.py @@ -246,12 +246,8 @@ def _convert_arff_data(arff, col_slice_x, col_slice_y, shape=None): """ arff_data = arff['data'] if isinstance(arff_data, Generator): - if shape[0] == -1: - count = -1 - else: - count = shape[0] * shape[1] data = np.fromiter(itertools.chain.from_iterable(arff_data), - dtype='float64', count=count) + dtype='float64') data = data.reshape(*shape) X = data[:, col_slice_x] y = data[:, col_slice_y] diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py index 8187dc3feb9db..92ad6529e1467 100644 --- a/sklearn/datasets/tests/test_openml.py +++ b/sklearn/datasets/tests/test_openml.py @@ -1193,7 +1193,13 @@ def test_fetch_openml_with_ignored_feature(monkeypatch, gzip_response): assert 'animal' not in dataset['feature_names'] -def test_fetch_openml_verify_checksum(monkeypatch): +@pytest.mark.parametrize('as_frame,cache', [ + (True, True), + (True, False), + (False, True), + (False, False) +]) +def test_fetch_openml_verify_checksum(monkeypatch, as_frame, cache): data_id = 2 _monkey_patch_webbased_functions(monkeypatch, data_id, True) @@ -1212,11 +1218,13 @@ def test_fetch_openml_verify_checksum(monkeypatch): modified_gzip.write(data) # should fail checksum validation + if as_frame: + pytest.importorskip('pandas') with pytest.raises(ValueError) as exc: - sklearn.datasets.fetch_openml(data_id=data_id, cache=False, - as_frame=True) + sklearn.datasets.fetch_openml(data_id=data_id, cache=cache, + as_frame=as_frame) # exception message should have file-path - assert "1666876" in repr(exc) + assert "1666876" in str(exc) finally: shutil.copy(backup_data_path, original_data_path) os.remove(backup_data_path) From 1e6efdd44585cb8f6731dd69bb72966db96746ac Mon Sep 17 00:00:00 2001 From: Shashank Singh Date: Sun, 3 May 2020 22:03:17 -0400 Subject: [PATCH 15/25] Cross platform assert in test --- 
sklearn/datasets/tests/test_openml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py index 92ad6529e1467..7955efdda8e39 100644 --- a/sklearn/datasets/tests/test_openml.py +++ b/sklearn/datasets/tests/test_openml.py @@ -1224,7 +1224,7 @@ def test_fetch_openml_verify_checksum(monkeypatch, as_frame, cache): sklearn.datasets.fetch_openml(data_id=data_id, cache=cache, as_frame=as_frame) # exception message should have file-path - assert "1666876" in str(exc) + assert exc.match("1666876") finally: shutil.copy(backup_data_path, original_data_path) os.remove(backup_data_path) From bf937a3cb1883ea73d41729c589a7d5b2caa7b8c Mon Sep 17 00:00:00 2001 From: Shashank Singh Date: Mon, 25 May 2020 18:31:52 -0400 Subject: [PATCH 16/25] Intentionally reach end-of-stream checksum validation --- sklearn/datasets/_openml.py | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/sklearn/datasets/_openml.py b/sklearn/datasets/_openml.py index fcf3e0a66759e..454df78ab4646 100644 --- a/sklearn/datasets/_openml.py +++ b/sklearn/datasets/_openml.py @@ -246,8 +246,12 @@ def _convert_arff_data(arff, col_slice_x, col_slice_y, shape=None): """ arff_data = arff['data'] if isinstance(arff_data, Generator): + if shape[0] == -1: + count = -1 + else: + count = shape[0] * shape[1] data = np.fromiter(itertools.chain.from_iterable(arff_data), - dtype='float64') + dtype='float64', count=count) data = data.reshape(*shape) X = data[:, col_slice_x] y = data[:, col_slice_y] @@ -463,17 +467,26 @@ def _stream_checksum_generator(response): for line in response: actual_md5_checksum.update(line) yield line.decode('utf-8') - # stream consumed, check md5 - if actual_md5_checksum.hexdigest() != md5_checksum: - raise ValueError("md5 checksum of local file for " + url - + " does not match description. " - "Downloaded file could have been modified / " - "corrupted, clean cache and retry...") - - arff = _arff.load(_stream_checksum_generator(response), + + stream = _stream_checksum_generator(response) + + arff = _arff.load(stream, return_type=return_type, encode_nominal=encode_nominal) - return parse_arff(arff) + + parsed_arff = parse_arff(arff) + + # consume remaining stream, if early exited + for _ in stream: + pass + + if actual_md5_checksum.hexdigest() != md5_checksum: + raise ValueError("md5 checksum of local file for " + url + + " does not match description. 
" + "Downloaded file could have been modified / " + "corrupted, clean cache and retry...") + + return parsed_arff def _download_data_to_bunch(url, sparse, data_home, *, as_frame, features_list, From eac5a1edf290f57662de4bc467921c85dd52b26b Mon Sep 17 00:00:00 2001 From: Shashank Singh Date: Sat, 6 Jun 2020 13:59:21 -0400 Subject: [PATCH 17/25] Test should not modify local test-suite shared file --- sklearn/datasets/tests/test_openml.py | 124 +++++++++++++------------- 1 file changed, 64 insertions(+), 60 deletions(-) diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py index 7955efdda8e39..64dafdded2661 100644 --- a/sklearn/datasets/tests/test_openml.py +++ b/sklearn/datasets/tests/test_openml.py @@ -8,7 +8,6 @@ import scipy.sparse import sklearn import pytest -import shutil from sklearn import config_context from sklearn.datasets import fetch_openml @@ -147,6 +146,38 @@ def _fetch_dataset_from_openml(data_id, data_name, data_version, return data_by_id +class MockHTTPResponse: + def __init__(self, data, is_gzip): + self.data = data + self.is_gzip = is_gzip + + def read(self, amt=-1): + return self.data.read(amt) + + def tell(self): + return self.data.tell() + + def seek(self, pos, whence=0): + return self.data.seek(pos, whence) + + def close(self): + self.data.close() + + def info(self): + if self.is_gzip: + return {'Content-Encoding': 'gzip'} + return {} + + def __iter__(self): + return iter(self.data) + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + return False + + def _monkey_patch_webbased_functions(context, data_id, gzip_response): @@ -161,37 +192,6 @@ def _monkey_patch_webbased_functions(context, path_suffix = '.gz' read_fn = gzip.open - class MockHTTPResponse: - def __init__(self, data, is_gzip): - self.data = data - self.is_gzip = is_gzip - - def read(self, amt=-1): - return self.data.read(amt) - - def tell(self): - return self.data.tell() - - def seek(self, pos, whence=0): - return self.data.seek(pos, whence) - - def close(self): - self.data.close() - - def info(self): - if self.is_gzip: - return {'Content-Encoding': 'gzip'} - return {} - - def __iter__(self): - return iter(self.data) - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - return False - def _file_name(url, suffix): return (re.sub(r'\W', '-', url[len("https://openml.org/"):]) + suffix + path_suffix) @@ -1193,38 +1193,42 @@ def test_fetch_openml_with_ignored_feature(monkeypatch, gzip_response): assert 'animal' not in dataset['feature_names'] -@pytest.mark.parametrize('as_frame,cache', [ - (True, True), - (True, False), - (False, True), - (False, False) -]) +@pytest.mark.parametrize('as_frame', [True, False]) def test_fetch_openml_verify_checksum(monkeypatch, as_frame, cache): data_id = 2 _monkey_patch_webbased_functions(monkeypatch, data_id, True) - # modify a mocked file content to change checksum + # create a modified (local) arff file dataset_dir = os.path.join(currdir, 'data', 'openml', str(data_id)) original_data_path = os.path.join(dataset_dir, 'data-v1-download-1666876.arff.gz') - backup_data_path = original_data_path + ".back" - shutil.copy(original_data_path, backup_data_path) - - try: - with gzip.GzipFile(backup_data_path, "rb") as orig_gzip, \ - gzip.GzipFile(original_data_path, "wb") as modified_gzip: - data = bytearray(orig_gzip.read()) - data[len(data)-1] = 37 - modified_gzip.write(data) - - # should fail checksum validation - if as_frame: - pytest.importorskip('pandas') - with 
pytest.raises(ValueError) as exc: - sklearn.datasets.fetch_openml(data_id=data_id, cache=cache, - as_frame=as_frame) - # exception message should have file-path - assert exc.match("1666876") - finally: - shutil.copy(backup_data_path, original_data_path) - os.remove(backup_data_path) + corrupt_copy = original_data_path + ".test_corrupt_arff" + with gzip.GzipFile(original_data_path, "rb") as orig_gzip, \ + gzip.GzipFile(corrupt_copy, "wb") as modified_gzip: + data = bytearray(orig_gzip.read()) + data[len(data)-1] = 37 + modified_gzip.write(data) + + # simulate request to return modified file + mocked_openml_url = sklearn.datasets._openml.urlopen + + def swap_file_mock(request): + url = request.get_full_url() + if url.endswith('data/v1/download/1666876'): + return MockHTTPResponse(open(corrupt_copy, "rb"), is_gzip=True) + else: + return mocked_openml_url(request) + + monkeypatch.setattr(sklearn.datasets._openml, 'urlopen', swap_file_mock) + + # validate failed checksum + if as_frame: + pytest.importorskip('pandas') + with pytest.raises(ValueError) as exc: + sklearn.datasets.fetch_openml(data_id=data_id, cache=False, + as_frame=as_frame) + # exception message should have file-path + assert exc.match("1666876") + + # cleanup fake local file + os.remove(corrupt_copy) From d075a830cfa6b16e704a7c92823a78f2fbaa36fa Mon Sep 17 00:00:00 2001 From: Shashank Singh Date: Sat, 6 Jun 2020 21:53:33 -0400 Subject: [PATCH 18/25] Update sklearn/datasets/tests/test_openml.py flake8 Co-authored-by: Thomas J. Fan --- sklearn/datasets/tests/test_openml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py index 64dafdded2661..0e3e63925c139 100644 --- a/sklearn/datasets/tests/test_openml.py +++ b/sklearn/datasets/tests/test_openml.py @@ -1204,7 +1204,7 @@ def test_fetch_openml_verify_checksum(monkeypatch, as_frame, cache): 'data-v1-download-1666876.arff.gz') corrupt_copy = original_data_path + ".test_corrupt_arff" with gzip.GzipFile(original_data_path, "rb") as orig_gzip, \ - gzip.GzipFile(corrupt_copy, "wb") as modified_gzip: + gzip.GzipFile(corrupt_copy, "wb") as modified_gzip: data = bytearray(orig_gzip.read()) data[len(data)-1] = 37 modified_gzip.write(data) From 23ba1905e2ab37c0321e5e3f3ff7f809870047b8 Mon Sep 17 00:00:00 2001 From: Shashank Singh Date: Sat, 6 Jun 2020 22:39:48 -0400 Subject: [PATCH 19/25] Use tmpdir for creating corrupt file, add comments, update changelog --- doc/whats_new/v0.24.rst | 8 +++++--- sklearn/datasets/tests/test_openml.py | 10 ++++++---- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst index dd4ab30a7f2ff..6a968d51871a5 100644 --- a/doc/whats_new/v0.24.rst +++ b/doc/whats_new/v0.24.rst @@ -44,9 +44,11 @@ Changelog :pr:`123456` by :user:`Joe Bloggs `. where 123456 is the *pull request* number, not the issue number. -:mod:`sklearn.module` -..................... - +:mod:`sklearn.datasets` +....................... +- |Feature| :func:`datasets.fetch_openml` now validates md5checksum of arff + files downloaded or cached to ensure data integrity. + :pr:`14800` by :user:`Shashank Singh ` and `Joel Nothman`_. 
Code and Documentation Contributors ----------------------------------- diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py index 64dafdded2661..a55c0a70f1d59 100644 --- a/sklearn/datasets/tests/test_openml.py +++ b/sklearn/datasets/tests/test_openml.py @@ -1194,22 +1194,24 @@ def test_fetch_openml_with_ignored_feature(monkeypatch, gzip_response): @pytest.mark.parametrize('as_frame', [True, False]) -def test_fetch_openml_verify_checksum(monkeypatch, as_frame, cache): +def test_fetch_openml_verify_checksum(monkeypatch, as_frame, cache, tmpdir): data_id = 2 _monkey_patch_webbased_functions(monkeypatch, data_id, True) - # create a modified (local) arff file + # create a temporary modified arff file dataset_dir = os.path.join(currdir, 'data', 'openml', str(data_id)) original_data_path = os.path.join(dataset_dir, 'data-v1-download-1666876.arff.gz') - corrupt_copy = original_data_path + ".test_corrupt_arff" + corrupt_copy = tmpdir + "/test_invalid_checksum.arff" with gzip.GzipFile(original_data_path, "rb") as orig_gzip, \ gzip.GzipFile(corrupt_copy, "wb") as modified_gzip: data = bytearray(orig_gzip.read()) data[len(data)-1] = 37 modified_gzip.write(data) - # simulate request to return modified file + # Requests are already mocked by monkey_patch_webbased_functions. + # We want to re-use that mock for all requests except file download, + # hence creating a thin mock over the original mock mocked_openml_url = sklearn.datasets._openml.urlopen def swap_file_mock(request): From c2161712b00dcf0181befee88402448bf87922ee Mon Sep 17 00:00:00 2001 From: Shashank Singh Date: Sat, 6 Jun 2020 23:15:03 -0400 Subject: [PATCH 20/25] Make test-path platform independent --- sklearn/datasets/tests/test_openml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py index 5a880811771bf..1d1341989f740 100644 --- a/sklearn/datasets/tests/test_openml.py +++ b/sklearn/datasets/tests/test_openml.py @@ -1202,7 +1202,7 @@ def test_fetch_openml_verify_checksum(monkeypatch, as_frame, cache, tmpdir): dataset_dir = os.path.join(currdir, 'data', 'openml', str(data_id)) original_data_path = os.path.join(dataset_dir, 'data-v1-download-1666876.arff.gz') - corrupt_copy = tmpdir + "/test_invalid_checksum.arff" + corrupt_copy = os.path.join(tmpdir, "test_invalid_checksum.arff") with gzip.GzipFile(original_data_path, "rb") as orig_gzip, \ gzip.GzipFile(corrupt_copy, "wb") as modified_gzip: data = bytearray(orig_gzip.read()) From 536bc4f683518ca78c2998ce22c46bb3a71c742a Mon Sep 17 00:00:00 2001 From: Shashank Singh Date: Sun, 7 Jun 2020 10:40:51 -0400 Subject: [PATCH 21/25] Do not remove file explicitly from tmpdir --- sklearn/datasets/tests/test_openml.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py index 1d1341989f740..b42551ba2fdfe 100644 --- a/sklearn/datasets/tests/test_openml.py +++ b/sklearn/datasets/tests/test_openml.py @@ -1231,6 +1231,3 @@ def swap_file_mock(request): as_frame=as_frame) # exception message should have file-path assert exc.match("1666876") - - # cleanup fake local file - os.remove(corrupt_copy) From c55f64a2af78d953de8dfa00f893bbfecc28b120 Mon Sep 17 00:00:00 2001 From: Shashank Singh Date: Sun, 7 Jun 2020 18:17:34 -0400 Subject: [PATCH 22/25] Make test mock class private to ignore coverage --- sklearn/datasets/tests/test_openml.py | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 
16 deletions(-) diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py index b42551ba2fdfe..04e65c960e4c2 100644 --- a/sklearn/datasets/tests/test_openml.py +++ b/sklearn/datasets/tests/test_openml.py @@ -146,7 +146,7 @@ def _fetch_dataset_from_openml(data_id, data_name, data_version, return data_by_id -class MockHTTPResponse: +class _MockHTTPResponse: def __init__(self, data, is_gzip): self.data = data self.is_gzip = is_gzip @@ -154,12 +154,6 @@ def __init__(self, data, is_gzip): def read(self, amt=-1): return self.data.read(amt) - def tell(self): - return self.data.tell() - - def seek(self, pos, whence=0): - return self.data.seek(pos, whence) - def close(self): self.data.close() @@ -204,10 +198,10 @@ def _mock_urlopen_data_description(url, has_gzip_header): if has_gzip_header and gzip_response: fp = open(path, 'rb') - return MockHTTPResponse(fp, True) + return _MockHTTPResponse(fp, True) else: fp = read_fn(path, 'rb') - return MockHTTPResponse(fp, False) + return _MockHTTPResponse(fp, False) def _mock_urlopen_data_features(url, has_gzip_header): assert url.startswith(url_prefix_data_features) @@ -215,10 +209,10 @@ def _mock_urlopen_data_features(url, has_gzip_header): _file_name(url, '.json')) if has_gzip_header and gzip_response: fp = open(path, 'rb') - return MockHTTPResponse(fp, True) + return _MockHTTPResponse(fp, True) else: fp = read_fn(path, 'rb') - return MockHTTPResponse(fp, False) + return _MockHTTPResponse(fp, False) def _mock_urlopen_download_data(url, has_gzip_header): assert (url.startswith(url_prefix_download_data)) @@ -228,10 +222,10 @@ def _mock_urlopen_download_data(url, has_gzip_header): if has_gzip_header and gzip_response: fp = open(path, 'rb') - return MockHTTPResponse(fp, True) + return _MockHTTPResponse(fp, True) else: fp = read_fn(path, 'rb') - return MockHTTPResponse(fp, False) + return _MockHTTPResponse(fp, False) def _mock_urlopen_data_list(url, has_gzip_header): assert url.startswith(url_prefix_data_list) @@ -248,10 +242,10 @@ def _mock_urlopen_data_list(url, has_gzip_header): if has_gzip_header: fp = open(json_file_path, 'rb') - return MockHTTPResponse(fp, True) + return _MockHTTPResponse(fp, True) else: fp = read_fn(json_file_path, 'rb') - return MockHTTPResponse(fp, False) + return _MockHTTPResponse(fp, False) def _mock_urlopen(request): url = request.get_full_url() @@ -1217,7 +1211,7 @@ def test_fetch_openml_verify_checksum(monkeypatch, as_frame, cache, tmpdir): def swap_file_mock(request): url = request.get_full_url() if url.endswith('data/v1/download/1666876'): - return MockHTTPResponse(open(corrupt_copy, "rb"), is_gzip=True) + return _MockHTTPResponse(open(corrupt_copy, "rb"), is_gzip=True) else: return mocked_openml_url(request) From fc9181dfe73285921841f488e13e84032c3142df Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 25 Jun 2020 21:38:26 +0200 Subject: [PATCH 23/25] Fix merge conflict issues --- sklearn/datasets/_openml.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sklearn/datasets/_openml.py b/sklearn/datasets/_openml.py index 58d199117919c..d2ce5d471f8f7 100644 --- a/sklearn/datasets/_openml.py +++ b/sklearn/datasets/_openml.py @@ -488,6 +488,7 @@ def _get_num_samples(data_qualities: OpenmlQualitiesType) -> int: qualities = {d['name']: d['value'] for d in data_qualities} return int(float(qualities.get('NumberOfInstances', default_n_samples))) + def _load_arff_response( url: str, data_home: Optional[str], @@ -515,7 +516,7 @@ def _stream_checksum_generator(response): 
encode_nominal=encode_nominal) parsed_arff = parse_arff(arff) - + # consume remaining stream, if early exited for _ in stream: pass @@ -538,7 +539,8 @@ def _download_data_to_bunch( features_list: List, data_columns: List[int], target_columns: List, - shape: Optional[Tuple[int, int]] + shape: Optional[Tuple[int, int]], + md5_checksum: str ): """Download OpenML ARFF and convert to Bunch of data """ From 07a53b6064753c1ac0a1a0f3881f454458409573 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 25 Jun 2020 21:43:12 +0200 Subject: [PATCH 24/25] fmt --- sklearn/datasets/tests/test_openml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py index 8157f96eee769..687c22cdbfd37 100644 --- a/sklearn/datasets/tests/test_openml.py +++ b/sklearn/datasets/tests/test_openml.py @@ -1265,4 +1265,4 @@ def test_convert_arff_data_type(): } msg = r"arff\['data'\] must be a generator when converting to pd.DataFrame" with pytest.raises(ValueError, match=msg): - _convert_arff_data_dataframe(arff, ['a'], {}) \ No newline at end of file + _convert_arff_data_dataframe(arff, ['a'], {}) From d5bebcf7a2d708905f31a83607817ad12925a8d9 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Thu, 25 Jun 2020 20:03:49 -0400 Subject: [PATCH 25/25] CLN Early skip if pandas is not avaliable --- sklearn/datasets/tests/test_openml.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py index 687c22cdbfd37..4e1220807947f 100644 --- a/sklearn/datasets/tests/test_openml.py +++ b/sklearn/datasets/tests/test_openml.py @@ -1206,6 +1206,9 @@ def test_fetch_openml_with_ignored_feature(monkeypatch, gzip_response): @pytest.mark.parametrize('as_frame', [True, False]) def test_fetch_openml_verify_checksum(monkeypatch, as_frame, cache, tmpdir): + if as_frame: + pytest.importorskip('pandas') + data_id = 2 _monkey_patch_webbased_functions(monkeypatch, data_id, True) @@ -1235,8 +1238,6 @@ def swap_file_mock(request): monkeypatch.setattr(sklearn.datasets._openml, 'urlopen', swap_file_mock) # validate failed checksum - if as_frame: - pytest.importorskip('pandas') with pytest.raises(ValueError) as exc: sklearn.datasets.fetch_openml(data_id=data_id, cache=False, as_frame=as_frame)
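Note on the streaming checksum pattern used above: the MD5 digest is updated on the raw bytes as the ARFF generator yields decoded lines, so the comparison can only happen once the stream has been fully consumed, which is why the helper drains any remaining lines before checking. A minimal, self-contained sketch of the idea (illustrative names only, not the scikit-learn implementation itself):

    import hashlib
    from io import BytesIO

    # Illustrative sketch; the function and argument names are not sklearn's.
    def iter_with_md5_check(fileobj, expected_md5, url="<resource>"):
        # Yield decoded lines while hashing the raw bytes; compare digests
        # only after the underlying stream is exhausted.
        md5 = hashlib.md5()
        for line in fileobj:
            md5.update(line)
            yield line.decode("utf-8")
        if md5.hexdigest() != expected_md5:
            raise ValueError("md5 checksum of {} does not match the expected "
                             "value; the download may be corrupted".format(url))

    payload = b"@relation demo\n@attribute x numeric\n@data\n1.0\n"
    good = hashlib.md5(payload).hexdigest()
    list(iter_with_md5_check(BytesIO(payload), good))          # passes silently
    try:
        list(iter_with_md5_check(BytesIO(payload), "0" * 32))  # raises
    except ValueError as exc:
        print(exc)

If a parser stopped reading early, the final lines would never pass through the hash, which is the failure mode the "consume remaining stream, if early exited" loop guards against.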
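Note on the corrupted-fixture technique in test_fetch_openml_verify_checksum: the test re-compresses a copy of the cached ARFF with one byte of the decompressed payload changed, so the gzip container stays readable but the MD5 of the decoded content no longer matches the value recorded in the dataset description. A rough, self-contained sketch of that setup (temporary paths and file contents below are made up for illustration):

    import gzip
    import hashlib
    import os
    import tempfile

    tmp = tempfile.mkdtemp()
    src = os.path.join(tmp, "fixture.arff.gz")
    dst = os.path.join(tmp, "fixture_corrupt.arff.gz")

    with gzip.GzipFile(src, "wb") as f:
        f.write(b"@relation demo\n@data\n1.0\n")

    # Re-gzip a copy with the last decompressed byte altered.
    with gzip.GzipFile(src, "rb") as orig, gzip.GzipFile(dst, "wb") as out:
        data = bytearray(orig.read())
        data[-1] = 37
        out.write(bytes(data))

    with gzip.GzipFile(src, "rb") as a, gzip.GzipFile(dst, "rb") as b:
        assert (hashlib.md5(a.read()).hexdigest()
                != hashlib.md5(b.read()).hexdigest())

Serving the corrupted copy through the monkeypatched urlopen (the thin swap_file_mock layered over the existing mock) is then enough to make fetch_openml raise the checksum ValueError that the test asserts on.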