From 5e9f6c94995aa7fdc4eaf1efeb5b73bcb73fde77 Mon Sep 17 00:00:00 2001 From: Shashank Singh Date: Sat, 24 Aug 2019 16:56:45 -0400 Subject: [PATCH 01/25] add verify_checksum functionality with tests --- sklearn/datasets/openml.py | 49 ++++++++++++-- sklearn/datasets/tests/test_openml.py | 93 ++++++++++++++++++++------- 2 files changed, 112 insertions(+), 30 deletions(-) diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py index d0197537949d9..826e0b57977e4 100644 --- a/sklearn/datasets/openml.py +++ b/sklearn/datasets/openml.py @@ -1,6 +1,8 @@ import gzip import json import os +from io import BytesIO +import hashlib import shutil from os.path import join from warnings import warn @@ -61,7 +63,7 @@ def wrapper(): return decorator -def _open_openml_url(openml_path, data_home): +def _open_openml_url(openml_path, data_home, expected_md5_checksum=None): """ Returns a resource from OpenML.org. Caches it to data_home if required. @@ -86,10 +88,29 @@ def is_gzip(_fsrc): req = Request(_OPENML_PREFIX + openml_path) req.add_header('Accept-encoding', 'gzip') + def _md5_validated_stream(input_stream, md5_checksum): + """ + Consume binary stream to validate checksum, + return a new stream with same content + :param input_stream: Stream to read bytes from + :param md5_checksum: Expected md5 checksum + :return: Stream with the original content for consumption + """ + with closing(input_stream): + bytes_content = input_stream.read() + actual_md5_checksum = hashlib.md5(bytes_content).hexdigest() + if md5_checksum != actual_md5_checksum: + raise ValueError(f"md5checksum {actual_md5_checksum} \ + does not match {md5_checksum}") + return BytesIO(bytes_content) + if data_home is None: fsrc = urlopen(req) if is_gzip(fsrc): - return gzip.GzipFile(fileobj=fsrc, mode='rb') + fsrc = gzip.GzipFile(fileobj=fsrc, mode='rb') + if expected_md5_checksum: + # validating checksum reads and consumes the stream + return _md5_validated_stream(fsrc, expected_md5_checksum) return fsrc local_path = _get_local_path(openml_path, data_home) @@ -102,6 +123,10 @@ def is_gzip(_fsrc): try: with closing(urlopen(req)) as fsrc: + if expected_md5_checksum: + if is_gzip(fsrc): + fsrc = gzip.GzipFile(fileobj=fsrc, mode='rb') + fsrc = _md5_validated_stream(fsrc, expected_md5_checksum) if is_gzip(fsrc): with open(local_path, 'wb') as fdst: shutil.copyfileobj(fsrc, fdst) @@ -448,7 +473,8 @@ def _get_num_samples(data_qualities): return int(float(qualities.get('NumberOfInstances', default_n_samples))) -def _download_data_arff(file_id, sparse, data_home, encode_nominal=True): +def _download_data_arff(file_id, sparse, data_home, encode_nominal=True, + expected_md5_checksum=None): # Accesses an ARFF file on the OpenML server. 
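The _md5_validated_stream helper added above hashes the fully read payload and re-wraps it so downstream code can keep treating the result as a stream. A minimal self-contained sketch of that idea (function name and sample bytes are illustrative, not part of the diff):

    import hashlib
    from io import BytesIO

    def validated_stream(raw_bytes, expected_md5):
        # Hash the complete payload and compare against the digest OpenML reports.
        actual_md5 = hashlib.md5(raw_bytes).hexdigest()
        if actual_md5 != expected_md5:
            raise ValueError("md5checksum {} does not match {}".format(
                actual_md5, expected_md5))
        # Re-wrap the already-consumed bytes so callers can still read() them.
        return BytesIO(raw_bytes)

    payload = b"@RELATION demo\n@DATA\n1.0\n"
    stream = validated_stream(payload, hashlib.md5(payload).hexdigest())
    print(stream.read())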
Documentation: # https://www.openml.org/api_data_docs#!/data/get_download_id # encode_nominal argument is to ensure unit testing, do not alter in @@ -457,7 +483,8 @@ def _download_data_arff(file_id, sparse, data_home, encode_nominal=True): @_retry_with_clean_cache(url, data_home) def _arff_load(): - with closing(_open_openml_url(url, data_home)) as response: + with closing(_open_openml_url(url, data_home, expected_md5_checksum)) \ + as response: if sparse is True: return_type = _arff.COO else: @@ -515,7 +542,7 @@ def _valid_data_column_names(features_list, target_columns): def fetch_openml(name=None, version='active', data_id=None, data_home=None, target_column='default-target', cache=True, return_X_y=False, - as_frame=False): + as_frame=False, verify_checksum=True): """Fetch dataset from openml by name or dataset id. Datasets are uniquely identified by either an integer ID or by a @@ -576,6 +603,11 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None, data. If ``return_X_y`` is True, then ``(data, target)`` will be pandas DataFrames or Series as describe above. + verify_checksum : boolean, default=True + If True, verifies md5_checksum of file provided in /download/{id} + If cache=True, verification only happens during data download + from network. + Returns ------- @@ -727,8 +759,11 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None, shape = None # obtain the data - arff = _download_data_arff(data_description['file_id'], return_sparse, - data_home, encode_nominal=not as_frame) + arff = _download_data_arff(data_description['file_id'], + return_sparse, + data_home, + encode_nominal=not as_frame, + expected_md5_checksum=data_description["md5_checksum"] if verify_checksum else None) # noqa: E501 description = "{}\n\nDownloaded from openml.org.".format( data_description.pop('description')) diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py index 43e8c97848958..8e7bb14bfc32f 100644 --- a/sklearn/datasets/tests/test_openml.py +++ b/sklearn/datasets/tests/test_openml.py @@ -8,6 +8,7 @@ import scipy.sparse import sklearn import pytest +import shutil from sklearn import config_context from sklearn.datasets import fetch_openml @@ -48,7 +49,8 @@ def decode_column(data_bunch, col_idx): # non-nominal attribute return data_bunch.data[:, col_idx] - data_bunch = fetch_openml(data_id=data_id, cache=False, target_column=None) + data_bunch = fetch_openml(data_id=data_id, cache=False, target_column=None, + verify_checksum=False) # also obtain decoded arff data_description = _get_data_description_by_id(data_id, None) @@ -79,18 +81,19 @@ def _fetch_dataset_from_openml(data_id, data_name, data_version, # result. 
Note that this function can be mocked (by invoking # _monkey_patch_webbased_functions before invoking this function) data_by_name_id = fetch_openml(name=data_name, version=data_version, - cache=False) + cache=False, verify_checksum=False) assert int(data_by_name_id.details['id']) == data_id # Please note that cache=False is crucial, as the monkey patched files are # not consistent with reality - fetch_openml(name=data_name, cache=False) + fetch_openml(name=data_name, cache=False, verify_checksum=False) # without specifying the version, there is no guarantee that the data id # will be the same # fetch with dataset id data_by_id = fetch_openml(data_id=data_id, cache=False, - target_column=target_column) + target_column=target_column, + verify_checksum=False) assert data_by_id.details['name'] == data_name assert data_by_id.data.shape == (expected_observations, expected_features) if isinstance(target_column, str): @@ -115,7 +118,8 @@ def _fetch_dataset_from_openml(data_id, data_name, data_version, if compare_default_target: # check whether the data by id and data by id target are equal - data_by_id_default = fetch_openml(data_id=data_id, cache=False) + data_by_id_default = fetch_openml(data_id=data_id, cache=False, + verify_checksum=False) if data_by_id.data.dtype == np.float64: np.testing.assert_allclose(data_by_id.data, data_by_id_default.data) @@ -137,7 +141,7 @@ def _fetch_dataset_from_openml(data_id, data_name, data_version, # test return_X_y option fetch_func = partial(fetch_openml, data_id=data_id, cache=False, - target_column=target_column) + target_column=target_column, verify_checksum=False) check_return_X_y(data_by_id, fetch_func) return data_by_id @@ -399,7 +403,8 @@ def test_fetch_openml_anneal_pandas(monkeypatch): _monkey_patch_webbased_functions(monkeypatch, data_id, True) bunch = fetch_openml(data_id=data_id, as_frame=True, - target_column=target_column, cache=False) + target_column=target_column, cache=False, + verify_checksum=False) data = bunch.data target = bunch.target frame = bunch.frame @@ -482,7 +487,8 @@ def test_convert_arff_data_dataframe_warning_low_memory_pandas(monkeypatch): msg = 'Could not adhere to working_memory config.' 
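These test updates thread the new keyword through every call. Assuming the verify_checksum flag this patch adds to fetch_openml, a typical call site would look like the sketch below (data_id 40945, the titanic dataset used later in this series, is shown only as an example value):

    from sklearn.datasets import fetch_openml

    # Default behaviour added by the patch: the md5_checksum reported in the
    # dataset description is verified, and a ValueError is raised on mismatch.
    titanic = fetch_openml(data_id=40945, as_frame=True)

    # Opting out, as these mocked tests do, because the local fixtures are
    # deliberately truncated copies of the real files:
    titanic = fetch_openml(data_id=40945, as_frame=True, verify_checksum=False)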
with pytest.warns(UserWarning, match=msg): with config_context(working_memory=1e-6): - fetch_openml(data_id=data_id, as_frame=True, cache=False) + fetch_openml(data_id=data_id, as_frame=True, cache=False, + verify_checksum=False) def test_fetch_openml_adultcensus_pandas_return_X_y(monkeypatch): @@ -499,7 +505,7 @@ def test_fetch_openml_adultcensus_pandas_return_X_y(monkeypatch): _monkey_patch_webbased_functions(monkeypatch, data_id, True) X, y = fetch_openml(data_id=data_id, as_frame=True, cache=False, - return_X_y=True) + return_X_y=True, verify_checksum=False) assert isinstance(X, pd.DataFrame) assert X.shape == data_shape n_categories = len([dtype for dtype in X.dtypes @@ -528,7 +534,8 @@ def test_fetch_openml_adultcensus_pandas(monkeypatch): target_column = 'class' _monkey_patch_webbased_functions(monkeypatch, data_id, True) - bunch = fetch_openml(data_id=data_id, as_frame=True, cache=False) + bunch = fetch_openml(data_id=data_id, as_frame=True, cache=False, + verify_checksum=False) data = bunch.data target = bunch.target frame = bunch.frame @@ -566,7 +573,8 @@ def test_fetch_openml_miceprotein_pandas(monkeypatch): frame_n_floats = 77 _monkey_patch_webbased_functions(monkeypatch, data_id, True) - bunch = fetch_openml(data_id=data_id, as_frame=True, cache=False) + bunch = fetch_openml(data_id=data_id, as_frame=True, cache=False, + verify_checksum=False) data = bunch.data target = bunch.target frame = bunch.frame @@ -606,7 +614,7 @@ def test_fetch_openml_emotions_pandas(monkeypatch): _monkey_patch_webbased_functions(monkeypatch, data_id, True) bunch = fetch_openml(data_id=data_id, as_frame=True, cache=False, - target_column=target_column) + target_column=target_column, verify_checksum=False) data = bunch.data target = bunch.target frame = bunch.frame @@ -997,7 +1005,8 @@ def _mock_urlopen_raise(request): monkeypatch, data_id, gzip_response) X_fetched, y_fetched = fetch_openml(data_id=data_id, cache=True, data_home=cache_directory, - return_X_y=True) + return_X_y=True, + verify_checksum=False) monkeypatch.setattr(sklearn.datasets.openml, 'urlopen', _mock_urlopen_raise) @@ -1030,12 +1039,13 @@ def test_fetch_openml_inactive(monkeypatch, gzip_response): _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response) glas2 = assert_warns_message( UserWarning, "Version 1 of dataset glass2 is inactive,", fetch_openml, - data_id=data_id, cache=False) + data_id=data_id, cache=False, verify_checksum=False) # fetch inactive dataset by name and version assert glas2.data.shape == (163, 9) glas2_by_version = assert_warns_message( UserWarning, "Version 1 of dataset glass2 is inactive,", fetch_openml, - data_id=None, name="glass2", version=1, cache=False) + data_id=None, name="glass2", version=1, cache=False, + verify_checksum=False) assert int(glas2_by_version.details['id']) == data_id @@ -1046,7 +1056,8 @@ def test_fetch_nonexiting(monkeypatch, gzip_response): _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response) # Note that we only want to search by name (not data id) assert_raise_message(ValueError, "No active dataset glass2 found", - fetch_openml, name='glass2', cache=False) + fetch_openml, name='glass2', cache=False, + verify_checksum=False) @pytest.mark.parametrize('gzip_response', [True, False]) @@ -1058,7 +1069,8 @@ def test_raises_illegal_multitarget(monkeypatch, gzip_response): assert_raise_message(ValueError, "Can only handle homogeneous multi-target datasets,", fetch_openml, data_id=data_id, - target_column=targets, cache=False) + target_column=targets, cache=False, + 
verify_checksum=False) @pytest.mark.parametrize('gzip_response', [True, False]) @@ -1071,20 +1083,20 @@ def test_warn_ignore_attribute(monkeypatch, gzip_response): assert_warns_message(UserWarning, expected_row_id_msg.format('MouseID'), fetch_openml, data_id=data_id, target_column='MouseID', - cache=False) + cache=False, verify_checksum=False) assert_warns_message(UserWarning, expected_ignore_msg.format('Genotype'), fetch_openml, data_id=data_id, target_column='Genotype', - cache=False) + cache=False, verify_checksum=False) # multi column test assert_warns_message(UserWarning, expected_row_id_msg.format('MouseID'), fetch_openml, data_id=data_id, target_column=['MouseID', 'class'], - cache=False) + cache=False, verify_checksum=False) assert_warns_message(UserWarning, expected_ignore_msg.format('Genotype'), fetch_openml, data_id=data_id, target_column=['Genotype', 'class'], - cache=False) + cache=False, verify_checksum=False) @pytest.mark.parametrize('gzip_response', [True, False]) @@ -1095,7 +1107,8 @@ def test_string_attribute_without_dataframe(monkeypatch, gzip_response): assert_raise_message(ValueError, ('STRING attributes are not supported for ' 'array representation. Try as_frame=True'), - fetch_openml, data_id=data_id, cache=False) + fetch_openml, data_id=data_id, cache=False, + verify_checksum=False) @pytest.mark.parametrize('gzip_response', [True, False]) @@ -1106,7 +1119,7 @@ def test_dataset_with_openml_error(monkeypatch, gzip_response): UserWarning, "OpenML registered a problem with the dataset. It might be unusable. " "Error:", - fetch_openml, data_id=data_id, cache=False + fetch_openml, data_id=data_id, cache=False, verify_checksum=False ) @@ -1173,3 +1186,37 @@ def test_fetch_openml_with_ignored_feature(monkeypatch, gzip_response): # so we assert that we don't have the ignored feature in the final Bunch assert dataset['data'].shape == (101, 16) assert 'animal' not in dataset['feature_names'] + + +def test_fetch_openml_verify_checksum(monkeypatch): + data_id = 2 + _monkey_patch_webbased_functions(monkeypatch, data_id, True) + + # modify a mocked file content to change checksum + dataset_dir = os.path.join(currdir, 'data', 'openml', str(data_id)) + original_data_path = os.path.join(dataset_dir, + 'data-v1-download-1666876.arff.gz') + backup_data_path = original_data_path + ".back" + shutil.copy(original_data_path, backup_data_path) + + try: + with gzip.GzipFile(backup_data_path, "rb") as orig_gzip, \ + gzip.GzipFile(original_data_path, "wb") as modified_gzip: + data = bytearray(orig_gzip.read()) + data[1] = 37 + modified_gzip.write(data) + + # succeeds if checksum validation is explicitly overriden + dataset = sklearn.datasets.fetch_openml(data_id=data_id, cache=False, + verify_checksum=False) + assert dataset + + # fails with checksum validation (default) + with pytest.raises(ValueError) as exc: + sklearn.datasets.fetch_openml(data_id=data_id, cache=False, + verify_checksum=True) + + assert exc + finally: + shutil.copy(backup_data_path, original_data_path) + os.remove(backup_data_path) From 5df30c063585ab84d8bf89d7795483e685e1b8ef Mon Sep 17 00:00:00 2001 From: Shashank Singh Date: Sat, 24 Aug 2019 17:28:58 -0400 Subject: [PATCH 02/25] python3.5 compatible multi-line string --- sklearn/datasets/openml.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py index 826e0b57977e4..d5e02728c845c 100644 --- a/sklearn/datasets/openml.py +++ b/sklearn/datasets/openml.py @@ -100,8 +100,8 @@ def 
_md5_validated_stream(input_stream, md5_checksum): bytes_content = input_stream.read() actual_md5_checksum = hashlib.md5(bytes_content).hexdigest() if md5_checksum != actual_md5_checksum: - raise ValueError(f"md5checksum {actual_md5_checksum} \ - does not match {md5_checksum}") + raise ValueError(f"md5checksum {actual_md5_checksum} " + "does not match {md5_checksum}") return BytesIO(bytes_content) if data_home is None: From df4c0493c4f1bb89c373035d0311eba908db5223 Mon Sep 17 00:00:00 2001 From: Shashank Singh Date: Sat, 24 Aug 2019 22:09:56 -0400 Subject: [PATCH 03/25] use titanic local file and format string --- sklearn/datasets/openml.py | 32 ++++++++++++++++----------- sklearn/datasets/tests/test_openml.py | 10 ++++++--- 2 files changed, 26 insertions(+), 16 deletions(-) diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py index d5e02728c845c..a56d1d00e6072 100644 --- a/sklearn/datasets/openml.py +++ b/sklearn/datasets/openml.py @@ -92,16 +92,26 @@ def _md5_validated_stream(input_stream, md5_checksum): """ Consume binary stream to validate checksum, return a new stream with same content - :param input_stream: Stream to read bytes from - :param md5_checksum: Expected md5 checksum - :return: Stream with the original content for consumption + + Parameters + ---------- + input_stream : io.BufferedIOBase + Input stream with a read() method to get content in bytes + + md5_checksum: str + Expected md5 checksum + + Returns + ------- + BytesIO stream with the same content as input_stream for consumption """ with closing(input_stream): bytes_content = input_stream.read() actual_md5_checksum = hashlib.md5(bytes_content).hexdigest() if md5_checksum != actual_md5_checksum: - raise ValueError(f"md5checksum {actual_md5_checksum} " - "does not match {md5_checksum}") + raise ValueError("md5checksum: {} does not match expected: " + "{}".format(actual_md5_checksum, + md5_checksum)) return BytesIO(bytes_content) if data_home is None: @@ -123,16 +133,12 @@ def _md5_validated_stream(input_stream, md5_checksum): try: with closing(urlopen(req)) as fsrc: + if is_gzip(fsrc): # unzip it for checksum validation + fsrc = gzip.GzipFile(fileobj=fsrc, mode='rb') if expected_md5_checksum: - if is_gzip(fsrc): - fsrc = gzip.GzipFile(fileobj=fsrc, mode='rb') fsrc = _md5_validated_stream(fsrc, expected_md5_checksum) - if is_gzip(fsrc): - with open(local_path, 'wb') as fdst: - shutil.copyfileobj(fsrc, fdst) - else: - with gzip.GzipFile(local_path, 'wb') as fdst: - shutil.copyfileobj(fsrc, fdst) + with gzip.GzipFile(local_path, 'wb') as fdst: + shutil.copyfileobj(fsrc, fdst) except Exception: if os.path.exists(local_path): os.unlink(local_path) diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py index 8e7bb14bfc32f..14f236929bb51 100644 --- a/sklearn/datasets/tests/test_openml.py +++ b/sklearn/datasets/tests/test_openml.py @@ -1189,13 +1189,15 @@ def test_fetch_openml_with_ignored_feature(monkeypatch, gzip_response): def test_fetch_openml_verify_checksum(monkeypatch): - data_id = 2 + # locally, titanic dataset is not trimmed, hence a valid + # candidate for a checksum test failure + data_id = 40945 _monkey_patch_webbased_functions(monkeypatch, data_id, True) # modify a mocked file content to change checksum dataset_dir = os.path.join(currdir, 'data', 'openml', str(data_id)) original_data_path = os.path.join(dataset_dir, - 'data-v1-download-1666876.arff.gz') + 'data-v1-download-16826755.arff.gz') backup_data_path = original_data_path + ".back" 
shutil.copy(original_data_path, backup_data_path) @@ -1203,17 +1205,19 @@ def test_fetch_openml_verify_checksum(monkeypatch): with gzip.GzipFile(backup_data_path, "rb") as orig_gzip, \ gzip.GzipFile(original_data_path, "wb") as modified_gzip: data = bytearray(orig_gzip.read()) - data[1] = 37 + data[len(data)-1] = 37 modified_gzip.write(data) # succeeds if checksum validation is explicitly overriden dataset = sklearn.datasets.fetch_openml(data_id=data_id, cache=False, + as_frame=True, verify_checksum=False) assert dataset # fails with checksum validation (default) with pytest.raises(ValueError) as exc: sklearn.datasets.fetch_openml(data_id=data_id, cache=False, + as_frame=True, verify_checksum=True) assert exc From f4ca32b5d824330edbca3a3706d0835f9a180075 Mon Sep 17 00:00:00 2001 From: Shashank Singh Date: Sun, 25 Aug 2019 18:55:15 -0400 Subject: [PATCH 04/25] update locally truncated arff md5sums --- .../openml/1119/api-v1-json-data-1119.json.gz | Bin 707 -> 711 bytes .../data/openml/2/api-v1-json-data-2.json.gz | Bin 1362 -> 1363 bytes .../openml/292/api-v1-json-data-292.json.gz | Bin 547 -> 547 bytes .../40589/api-v1-json-data-40589.json.gz | Bin 596 -> 598 bytes .../40675/api-v1-json-data-40675.json.gz | Bin 323 -> 323 bytes .../40966/api-v1-json-data-40966.json.gz | Bin 1659 -> 1660 bytes sklearn/datasets/tests/test_openml.py | 7 ++----- 7 files changed, 2 insertions(+), 5 deletions(-) diff --git a/sklearn/datasets/tests/data/openml/1119/api-v1-json-data-1119.json.gz b/sklearn/datasets/tests/data/openml/1119/api-v1-json-data-1119.json.gz index 97ab0d1ce64f686215c763732e3c303fcfae22db..286e57672339a1ad99633afd2ea918d62c8c13b3 100644 GIT binary patch delta 694 zcmV;n0!jVD1;+&lABzYGDi32_kq94uO-~~+488BKu;v7#e$14%v~vS0v}hNE3Y-{J zrHQAFcAAV5PrD<;f5!>kT{!K{cI;>S<@q&Hg2c)rtrU9cbVJ&jq4R;Z#E|@a`*XQ`IoT35ML8^o;s+vl3A;pbrQE@d2fO<$Z1pp)(?^E^4^4rb35CB7|v-J=CZ+2n+3{WzM?D z!GdXrTWuJw2l;$H4f~WPg!wa|a&{M61UkTBW4$Kpx&r>gOaMvKJ4;w_z4oAvh-_+w zhBT7`WFz?T2f;HmD$oX=u5Nm;Ziz>)I}XUY<&TZYQI?PUdDhR)AG53&=fxywwAM0k&hPi_nw?PQYIegEK8|rX90p%T;fCdKpR;8T zCbTGrlW~@fnA}fN9|%`}RbeV!<0_s-Op2PX1m~?txD8*BAaa_?3q!~tDo%0sz%^f< zZWbITu}O+&MsO@d#erQ%+jZ-S8D52Z^3*kvY-?-Lk+L_aA|?NZ=Wa@?tbE&cW1-GwqseTRRnxIJ c$7*`6GC5yov+;}xd3*cz4je#EYCZ%20P|B&1poj5 delta 690 zcmV;j0!{tL1;YghABzYG4$-??kq94uQBUJA4E`%tpP<^-c3Xk<4VciOI}jT1MA4Kc z?iO8_REfJ?h4}C1l-(UX-J9*$pY1Q7-y$VQtUc0Np_fj#q^*gHw@9m~h_WnOM5D-v z9kM4>*U(s^@tx=B9-SYOql~>TV|w^*w_>kB7zJ0txgM^6Q2s;* zP?|V?w4IX}l3(tAtX8ilTcWlolQI#%teHqZsrqh$sw?s|P0QpI;G^)NLE#akCOoz} zHX1=0JxbANvw`OpXp3+uW#@$K!Pamu^d84oqDK$cjqqOAn%5~nz!IendyStQB7EyB zr&Sq^;K5mms2I=(gn;;n((*okw}#Le5mpxsTw7D4Ln9HwG{znpR2zh)_R=zE{lLMJ zX@@&)7;XgFVsRGsDNP9TXF%=jZfFtc0Eeyhnyl*!_zyDyB#!SbVZrshx&o}p2JHt>9XJA$es9=+)~Ae)Xqc1R{ChH(eP5g1D!Ve~YAgeFV}!pk5M zIGLV)`b68J*)Pdh;@qs!IJ4f|c&SfSRO z5M($_v-vnn$LaY~nilh{$b&{3Ed%HL{?JwIgfiE#8>aBX7>C1Q@?{imSSI_7EqgGb zMVaLDG@UZJpQJt!uCBv>RJy_Sa27+-p!r&G-kOBl@C6AXr7tm;!vHE@%Pk+J?{=JYb}`E#rz%&Xx~ONEXEn}c Ybv`>2xu{VJ6}`Xz1{munjXeYa0PfdIEdT%j diff --git a/sklearn/datasets/tests/data/openml/2/api-v1-json-data-2.json.gz b/sklearn/datasets/tests/data/openml/2/api-v1-json-data-2.json.gz index 22dfb6ff61c1b2693bfe040418e754d7f586239e..921326973b21228f939cf767d06fffd1aae106b0 100644 GIT binary patch literal 1363 zcmV-Z1+4lXiwFp13S(UW17UD!Ep{<2YIARHEo5PIVJ$K)YIARH0F_qXQ` zIwvRRrOUKAIhnw{dQ^I)LU^qebv%K14X^1^BrtW{NTC)$RRW(4UBgAGoKPHKrcO?# zx-^V?im&&VH{V`n&J}ht>(08=uuLp@D3}dPCPJQ+vu-YDT|X(vy3HSeqaRBv`n-^LNwhl9w%VSSDQ+F8$ZNx$-Dq-UmnouA$ItO$Uf(P>N zlsU*bRUV^LaM+o0>Po|Hr2~`HT5Jmol9~mVYbc)-+^`c63RxVLvNVfzH84kUjiAzk 
zWr8cliRYk@Av-vO@0=@aW%x-YVr}KT$ktXc4Bc~Bjie$+_blUi)m<1}aN`8_VYqbT z2Aswqn5py>&O%Z$C!}z7GzO((wIQP>QQTUBV;}vr1uCEohm|&uG?qMs8;4v=EtlxU z$6IZR9al8g%bLz`X4ckEUHn_E2{@i(h?R!khfogxYYJQ9!l8>@j1Lg!u7GLn3EDs^ca&}q0?fyrLbC~Op(8!f#g6Yuq|v7Uu~L?S8$HZe^x zroS4*E7Zel6jwW7(jX$zCj&AhrwwZC=VrKCxNL7IQ7;6gRX`WK4x^^}&{$D;Tbh(I z9;CR)))U>LEJ0)MZ|(n-v<@a9ugM#7MH-1Cl$iT#O}4`QE!YUfjahpY@cFXwgQ~{q zUMtEOyS-MFGhX4n!rO?8)+A8PSneH;D(1f!x7I$&$1=Qg+K9`I73C{3ZFh#>ZmcLC z!`+m$&0A)aEvIHFp0}c0{CnZ#^H!8A<;Ljets}p`jM<55YxQ!;lFzy^-m;mSb(hpA zypr)MZ*TkyJS3mUcB&|%Ed6$yJV%Nm{#_cK=FeFND5Bjf@xY6?-|BZ@MvI-rz5Mg$ z4+7J-Z{9RVYJg(8Ot8l`>er(%KuuR7X&sj5Kq#%`YB!$WK&Y@5zig?1e0e%n1Fx~( zkqV%P59yuy*T}!lM!g1&(iW6GR_C)hIJKooUK^UtnIiWdP=kV?N`*(k5iBV#EG8*T z2ACJ|ykfRHQrgXFQDC0+H}?jFmv_dE&tIy!P5_Q_VJ(I+uQ5mPvd&_7Eh>n4o%XQg zpqNbg0y9Y>uukEQZy0_j_h6gpl}{i(LDJazY+7f(r}6c;h$0XpYxAbS9M=m zNKKPDmH}1y)bB;Zpof2>yZ&GjjVJwKjl*erdXKp+WBh7{8|k~YrO9fw3NaI_T!z{# z>QV=fLWC`>4ZN>eHyVvbug*qT$5e$vNKKkH5emuYbzRqhp6a%gmeS3gFEvp1r<~`3 z)ne`xd~>`pxonkvjxO-)Ysb1WS+*bJ=!w&wpYD=G#c_#9<%X?r)*65uz%W*;uL>| VZdbkIub1`ndPU z{$3}gj!rFi)0A6gMB#*1oe})l-`nem6kDBshxiXgbM8N>Qk=@VZ^4bN@tvsd*=Fe+ z9i5jh)8^=C1P|&->7@$cjaJn06p|IZp$n11*l{C;ngdlSd^U6i7o~DSae#?BIvVTJ zFzzY7K3v{@d!0E~*wLgr=~BZovEZR#HY}M4d0I}oxtw(UtZ-6x(mk)jMe;)kAUZqi z)tq;dBLJ9i|Bo04P(Qh8p`~D}Fmt&)^pq2IpK;Je%!Q&7)^4E*1!hL)fNnzYK>nRF z2RWz8WAqFhcFs6;rQx>Hfk|pDwuJ>r&78{>l+OyT*$D`RERIT9n)#|4n4`ExP-($3 z!4>1gb5O{T9h|{;&K0&Y{G<}GwsKx%D=Qd=?m4VRQjw#3mhrsmE{ra?aRU1=T)K7x zPGbCA|iba3%>A_{TjBl|Y=q(_ti1~Od|CTJRg>9H zE6SO;omP}HS>nCI+lY(SBv8#n?i`LP=D!#>);`L|GQ4xzh|9GVWFQPr3=-vYDH77t|=c zlJP2UZ~O~9B%jD;swko?{dOC@M2aH*T^c>hU$PERM7vYsffsSN)$hQJ7F&yZ`RDB) z1jaXS-!?~TfMU8#u%|Za*P}2%O_w5V9hR3sD6QmbJD%S_sIV5lY^i{Jc|KMHud&{e z3ZRD%=^gu5>|ZBwuR)`<1!Ygw`D_kOZE2F%nr3~b$b$#epdhGH;X!Z!3yKSiNeYtz z=0!ZOnC%Xfc2imum}mXXy$0dsopIyymujvPfP-9Ei(yP_%mKWvvlw2B3KCwYJuEpW zCR0AgOp*$$Q@G=6hTq9O*k*d^6NpccG_gLL*4giQd_C?ktgD|oJ@~@s{NvSC-B%V; z({zeuKvh2Vd(klH;V-`L4@S}1s6VW6I89IQF?VHxU(IkMeb=TmSuU3$W@44gP@8#O z>flj`u!XgO_ciH8aU7qX#8}5vg+fTpY}Q05B%juGT?2Zq+h(+sZt8rgfwDj6JP)iE zbF1K+;*H5=tL$@hj$dCp)|KgMik9B$f!cKc-09=7l&y;eK+Z7jb!-)iT=iu$7V zo{N@9SOvJ!R@Eui9t&Rg=IQYi%NO=!%iPkpHd^-j-{UOYUB Uqt5=`&;6hO0e4b}%oPg&0G9@#9smFU diff --git a/sklearn/datasets/tests/data/openml/292/api-v1-json-data-292.json.gz b/sklearn/datasets/tests/data/openml/292/api-v1-json-data-292.json.gz index 888140f92b36025bdaf00eba9eeb917fd9c3964e..3313b6fb4a0a26fd6c4561e80f51b93b4b7390b3 100644 GIT binary patch literal 547 zcmV+;0^I!{iwFn<4r5&a17u-zVP9lrb7OL8aCB*JZZ2wbZ*BlpQrn8#Fc5uCzCyTf zVH4Y0JB|I8mxV$LOS4@lbunXEj>XE7(Ip{W_TMX6X`O&EXJ(G(a%SF_i>L(ywWg+S zsV0hzhDi~X=zY0ZM67iv!Uc{VyeVLy-m$mZDBui4@SmvC;Z>9d&wZ|Fwc1KKLT8-xJke|(s_J9;MiqR8oZ?kqv4%0&(I?*okrV?KaI)g?+s=;OXL)(on z6gvFhKC0rILY4T0WQPa`t*I^%zB+vU5^am*YIhWjRi2S%L8#7GF|NZGH zgotu%sixSyX4kmnoDwf_?dnX6nTHD$J8}u_eK^^z)BGjfmPu0LUEpy>s2~&P**4C) zK3CkkdSgaiR&f=d&*zOCsTlZ1D%XymuZmBpLvCljRuyN(4&y4G#o6dE8K|zi4$U~N z!);CMe>c;C<)${Ej_w@6xRo_FbpA6?3>=;HD;h4QZ(IQ;jZ zGD9^VEmtzB!25EsNOV&heyg9yn0|3i3Ju0Uu3*jH~RjP{wD}ZuXtRP&$B66MtBZdLW?L-VtET4OB zfPc1Upd*8F2F&U}=*_qG05r3jvomXNvvyhy(?=1N7@v8K|nF^q*Bh;_JcAsDEjuGp0V89{{stZ+Yn9*@0+OYcY{qZq| zsP=57r`Y|>u5l#YJYJcG1j$jcd(o>%4fVO zW?bCxOElxp$?tg9-0+4AZr_50@n$f_@57nkZ5i<^IQs#&F*LMgO?yUPjT-`?fn~Wb lXonao_C?u1RKVSKvukKcQMrN1a`CbJ_zTS2bE$~~005Kz4Z;8b diff --git a/sklearn/datasets/tests/data/openml/40589/api-v1-json-data-40589.json.gz b/sklearn/datasets/tests/data/openml/40589/api-v1-json-data-40589.json.gz index 9c71553ce5137c44e095e6a40f9c3817f0130d31..97089490a18c9e01f6a30a0936dc49cea9bf18cb 100644 GIT binary patch 
literal 598 zcmV-c0;&BUiwFpq4`W>b17u-zVP9lrb7OL8aCB*JZZ2wbZ*BlplgVz}FbsyzGfyG- z)BY<cF5D+d^c^mE=sPLEgPu?4rFG5=H(bKhp2pO~SxJW4LV? zH&lzs3NJqV{wZy00+SAl0J7`3?=U5fmh9h2#cZh3!#=sGN=EnlrQnI){+J zAQ%}smS}R1f?A~wMqhyT$f&VH6h=gNbalZ&O^qBWjRvK;&uM$UO1;AXzd1|ItS2oD zXFJrxG~LokatK$=CHxX&m%0jv9r`&7Zc}50lsjaArP7JZ+s+l$>01h2r@1lWjep=s zs#SUb>ml7gKRiU+JV^ztacBbUGB1v4UZzF$y(})p`BGNVPYTLM8hpOa#NC!~ExoC? zb~2YbtF!C%iq*u&L1Lx5I`hoy?5%Jobsc@KI@?$KeQ_%5Y@GuJ-L^}O4t%rhLUdcF z+=82IV?lRrfxucV$Q+A~;INq&ZUGx!v8~i_pcbIv2+c9W3$uU>M`Vsz%t?iXb*+4d zp+R8=q02l!;c&tuw{R}8ik!fK kmyDdCE%z-{6?OU~KM-;{lVrR3*nVvP00T^uffIz;U zD~HkJ#JF6Z=+h$k8Bup~V-1om+MOQIFd0sXKrI(w(1qI`?GY7|v7mvWInf1#0tS(h zp+kw3b8>2>HW+;&v`0pc9in7J4a88F9Mt3pNNO}l4Zf!BVUfC#f&2!Rnn^3o4X_RB ziBi{82_WIAL6Bd3>{8d^ut7g34mLHG3$a0luv9p4aoxD4I((f)TO-e|@YKJDkF7x7$=4D#!zn8_iIGxLVj1!!~NJGrmsd2A4T?=pOtySh+ zWmR^$T(DFy4gyQvRhegAWgmsR$;;?#RoQmG-4@5P%9c4`qT6=v(Sfh$U5H`Jlw0B^ zTU(+#w?M>N&6_DU9l>EU&D;VuykJ|X;Xo}w!x5TehG%908IH&tu~?IGOP00rKMWt_ zX5hNa^CJ#7Jc6wmiDtrF+^8D(Zb$Qrf#pX|0(m9PEp6b17u-zVP9lrb7OL8aCB*JZZ2wbZ*Blxkjrj^Fc3x8@D-KU z1#I(3!>*frK-DD@8f-(XnuqKmsEYFM#W7v9yYZPjo_psWM??Zb>lC(9IbrpJdY(xA za*v~kXt`IUpw+1)uO3V=Dvt&O&hfx*%DON^SRj9YI>gB`nlAMF*^Yp|_5N|W1hy__ z2HEazo(ni-r-T*>tv(sASzecU&>1b%B%YdO?e^C(ZjDD*#KqN3)@0kZrSqbu(WJ8- zn#{*+vK!m0d7J#MCM(WW$xGT~!C?xv?+=ICsP;%7M8C;;SPixacGyn_h*o!Nuq^y1 z$4Ru)oi-Z3|NC?ZxH?az0)hIr=ar!~%CZ$#CBED`dI8XLnL~FLZ?s~SR7~a_6+Gtx V3Qk2q;^+}S;vb?Mo>IgC003^+n{ogE literal 323 zcmV-J0lfYniwFovwM$z717u-zVP9lrb7OL8aCB*JZZ2wbZ*Blxkh^ZfFbqX!`zs2~ zO)Oh}Al|y<1B#9Y3|fywi2Q&=$0&mQd!@)7+KoEABJbrRO^5)5-bw6*blmC%^*oXG z?U5!4(ZW}xq;*q~_W&jsWk44L&J~f}rFHQS$s_qYX%`=s(G0P#W;+6g*2m}h9ND^< z8AR{hycQHRZw|CP(E2Mc+MKl&i#nsZocK$#tb>2;->nJginzG8%erjaw&T3WX*9># zUb-yA?6Q0At9hHkt}ZK2byZZf%c8?0SfzG{+Q@!SA4R{(6s!h&1bgu(14OHbHCpCj z$bJ&-^q`H#@Bcm>0N Vsv5-|E*n{T>W5rWyPyW(C(QfD2~lJmXx#+YsI`oeIe{9_!C5^VzB? 
z9S?*u03*4-KU9TIIEi`x`{4XTeQxc-^hO7x1E!O?xaXlrOjsl$0p;FmRr7X~lE}rE&uUEv!gb23El(XIc`Ekig(d6{!76r91<) z{!}d`3|BJMD(4|wU#cOz`|UNvQT=$Y2Rw?K-MD!ij^s#gOoEKx^URQdb{iIJtvOnT zpiR^!0k`arqfV8@8BrP(!fP^b>lzX!VZwbzq|ONp3G-zD-R~*o0AjV#KGua1EFO5ulD4m&&ddX!wNB*}b4JqGPN^ zLP8coKfxSHse&=TH{?#s2DWqmmN%d9t%V0*9t#XlI9_0HG*;wA#;mU)S1?d7i>_hj zD!JQps1yaqct*qlb$8b&DLoD53#vtDZ>rHH;B<;rv97dK7c=yK&aiPwPCS|?3LCwH ziv(;UGGSL6I8;K_M0w5-AwQuOG-E03e52_Z>^^=2AGIESeEj$k`!ghs!LWe`)r@|| zLBSJchGk?*z3W)^;89rq6T&vGsZ;822^5OI;;( zrFbvzxiA)H}bmTi*L@+$0f&`UW@PdrF9FhEveEApcJmPk8?)pTMP%cOT#rZs)j8g$pHl zSQ5B&?$2|K|J79hr*kH8)ZXD6Ktlf}fL}85c|U+#g-C{IpE9`p^#}NKNSmVqWWyx_ z<*scmt%1`OW-u=N&@Z>iba}y+`Kc$sEgCn?3c>r@x^!y+)EqS-Xyw&qV=#z2L}hG|uEz0UYxB|Fy6ARG_IqK3-$e-4V#}A9N479@N{!~8 z`_nvajgvV~?o5$8kHp6@OD2=3&5x#s>=3&dKZ&ez?qyMF_Y)4Y8y G4*&ofDk8)H literal 1659 zcmV->288(^iwFp1x=ULC17u-zVP9lrb7OL8aCB*JZZ2wbZ*Bn9S6OcxHxPcdf5kun z0@+!uwUTxENtNT&vE3M!+oFg9h1#Vg#=E3IawRK4{(EQ0)#bQIlRovqu$G+Tn|t`> z@Sv11%SMLVQNm5E#oP)dOI`SKcyLe>3H~abRy)mRDWC(%GEPS~BIdVRSuUi%Sa5AT zW9ihN?29WMANO)QQ~LP03;mR72EU4F&NU0*irE=YIhXJ@gf~nlLUN7A`t!3Aj6MD;ICdk&NVj#`v@vft40m*DsEGb^biZ{^b|m$A8gRU)!V>hx_S#`Vm*QZPbZak zddj5uEaVg}IJ3FtaKq*XE)y=Tn5>{sZlJG)6){V}D!ArMOX3j{7~H5Fwcn_Or-0U< ztEGg&N+w!mJcQe8HGp@&y@se-ZXR`kSEE`dsx`x*9LlXpkn#mj4GCzqV7}Ixp=Agv zAx-FYh`HutjVNNJEq^5PHpD42hCAL^Nvl-~OeP+PDS$wRt_1!#R|ch_4MfQwElkiB z`bY$Y9*qjnB=r9S!h)kZ(djU>Hw^fA#jp zGtLj>p86?REsVkge^}mg*iirzrO6k{G8foZ&C!K$Dc7cPEiz0L696V-5i559OH!@* z6n()de#p;RJX_lh$O?vK^OR%GQvwxmQ78ABcL4{aaAV73)Cd468m;P{(!kk8ynBZ2 zF`sUjqkU$lABxg*${8i#j{Q-!z0cx|C=3eWHJP__4Kb52<~}15=LClGBt|abb1tdE zp1NS2PFzf9b~mqm)X)Imrp732%qTxG;uBxE2273!P{))@Wmj`Fe9UKTAt;RK7^{(x zkcH5TF-H=rV9f6=xzn~e>}m^#N@zb(o-;(qPpAdWSPDDaYI+8{k6+(Mt&1NYKYqmi3~{ADsNlhVMnB`A z;E6KBQZl97aV$r$5tjXgu&ry#l=?dY2mNo+Q7C>IAqxBk;lC94Pbh8SLm#l!kNdFo zR9N;C!gkPg_T+gN?#ufRF0_RUe@fu9AYCf@=E6h%T~bxJVeB*Ar~Qi;FQ8`;@Hn>|M+COv%+0bc-8%JAu_|1iBS$R^TW$D* zrMd5L-y4)?e%#?gw-1vLPE?wzCH2G=ykxP^@!3nBYjyTg@l;IDMsn|Tm{+SE>MDsF zx2`>)ttfxx=hoA$$<*TIbNsz)EM?)nX*SN~4_ICIR!5!FP89lA zj;~63;Pmb;F8oxca!a5&~{jqNq}B-$%`Ewc3puo-V~!B4bB$n~(9UVAm-f zZQnJ^%e<4=VH9U!0om#tDtNJHS;hm7_$)v!E-Wd7k+?sU# zOq;MgwIj>)6d!?X-EoN0^&?NF)e_nFubtyA&piQd(YR^u5xg(0OScq2X;Yl^u`ayQ z2ghvL)X~2~D{rnV{eILYDr56xHHzwu?MG+lqSGnZ7s3d?ix8~&jxRBfY;Ne38qI?H z(=2I>;u()0OrFtQJw9!9ShL-3j3=ymT5oq+QN7)+wd*X3dA(LTJotL}^>27iFrF Date: Mon, 9 Sep 2019 20:12:39 -0400 Subject: [PATCH 05/25] return bytes instead of stream, read once --- sklearn/datasets/openml.py | 70 +++++++++++++-------------- sklearn/datasets/tests/test_openml.py | 10 ++-- 2 files changed, 40 insertions(+), 40 deletions(-) diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py index a56d1d00e6072..721d879d1e1f0 100644 --- a/sklearn/datasets/openml.py +++ b/sklearn/datasets/openml.py @@ -1,9 +1,7 @@ import gzip import json import os -from io import BytesIO import hashlib -import shutil from os.path import join from warnings import warn from contextlib import closing @@ -63,7 +61,7 @@ def wrapper(): return decorator -def _open_openml_url(openml_path, data_home, expected_md5_checksum=None): +def _openml_url_bytes(openml_path, data_home, expected_md5_checksum=None): """ Returns a resource from OpenML.org. Caches it to data_home if required. 
@@ -79,8 +77,8 @@ def _open_openml_url(openml_path, data_home, expected_md5_checksum=None): Returns ------- - result : stream - A stream to the OpenML resource + result : bytes + Byte content of resource """ def is_gzip(_fsrc): return _fsrc.info().get('Content-Encoding', '') == 'gzip' @@ -88,40 +86,38 @@ def is_gzip(_fsrc): req = Request(_OPENML_PREFIX + openml_path) req.add_header('Accept-encoding', 'gzip') - def _md5_validated_stream(input_stream, md5_checksum): + def _md5_validated_bytes(bytes_content, md5_checksum): """ Consume binary stream to validate checksum, return a new stream with same content Parameters ---------- - input_stream : io.BufferedIOBase - Input stream with a read() method to get content in bytes + bytes_content : bytes md5_checksum: str - Expected md5 checksum + Expected md5 checksum of bytes Returns ------- - BytesIO stream with the same content as input_stream for consumption + bytes """ - with closing(input_stream): - bytes_content = input_stream.read() - actual_md5_checksum = hashlib.md5(bytes_content).hexdigest() - if md5_checksum != actual_md5_checksum: - raise ValueError("md5checksum: {} does not match expected: " - "{}".format(actual_md5_checksum, - md5_checksum)) - return BytesIO(bytes_content) + actual_md5_checksum = hashlib.md5(bytes_content).hexdigest() + if md5_checksum != actual_md5_checksum: + raise ValueError("md5checksum: {} does not match expected: " + "{}".format(actual_md5_checksum, + md5_checksum)) + return bytes_content if data_home is None: fsrc = urlopen(req) if is_gzip(fsrc): fsrc = gzip.GzipFile(fileobj=fsrc, mode='rb') + bytes_content = fsrc.read() if expected_md5_checksum: # validating checksum reads and consumes the stream - return _md5_validated_stream(fsrc, expected_md5_checksum) - return fsrc + return _md5_validated_bytes(bytes_content, expected_md5_checksum) + return bytes_content local_path = _get_local_path(openml_path, data_home) if not os.path.exists(local_path): @@ -135,18 +131,23 @@ def _md5_validated_stream(input_stream, md5_checksum): with closing(urlopen(req)) as fsrc: if is_gzip(fsrc): # unzip it for checksum validation fsrc = gzip.GzipFile(fileobj=fsrc, mode='rb') + bytes_content = fsrc.read() if expected_md5_checksum: - fsrc = _md5_validated_stream(fsrc, expected_md5_checksum) + bytes_content = _md5_validated_bytes(bytes_content, + expected_md5_checksum) with gzip.GzipFile(local_path, 'wb') as fdst: - shutil.copyfileobj(fsrc, fdst) + fdst.write(bytes_content) except Exception: if os.path.exists(local_path): os.unlink(local_path) raise + else: + with gzip.GzipFile(local_path, "rb") as gzip_file: + bytes_content = gzip_file.read() # XXX: First time, decompression will not be necessary (by using fsrc), but # it will happen nonetheless - return gzip.GzipFile(local_path, 'rb') + return bytes_content def _get_json_content_from_openml_api(url, error_message, raise_if_error, @@ -183,8 +184,7 @@ def _get_json_content_from_openml_api(url, error_message, raise_if_error, @_retry_with_clean_cache(url, data_home) def _load_json(): - with closing(_open_openml_url(url, data_home)) as response: - return json.loads(response.read().decode("utf-8")) + return json.loads(_openml_url_bytes(url, data_home).decode("utf-8")) try: return _load_json() @@ -489,16 +489,16 @@ def _download_data_arff(file_id, sparse, data_home, encode_nominal=True, @_retry_with_clean_cache(url, data_home) def _arff_load(): - with closing(_open_openml_url(url, data_home, expected_md5_checksum)) \ - as response: - if sparse is True: - return_type = _arff.COO - else: - 
return_type = _arff.DENSE_GEN - - arff_file = _arff.loads(response.read().decode('utf-8'), - encode_nominal=encode_nominal, - return_type=return_type) + bytes_content = _openml_url_bytes(url, data_home, + expected_md5_checksum) + if sparse is True: + return_type = _arff.COO + else: + return_type = _arff.DENSE_GEN + + arff_file = _arff.loads(bytes_content.decode('utf-8'), + encode_nominal=encode_nominal, + return_type=return_type) return arff_file return _arff_load() diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py index 8ce2e2839fe6a..53cc2df71d240 100644 --- a/sklearn/datasets/tests/test_openml.py +++ b/sklearn/datasets/tests/test_openml.py @@ -12,7 +12,7 @@ from sklearn import config_context from sklearn.datasets import fetch_openml -from sklearn.datasets.openml import (_open_openml_url, +from sklearn.datasets.openml import (_openml_url_bytes, _get_data_description_by_id, _download_data_arff, _get_local_path, @@ -922,13 +922,13 @@ def test_open_openml_url_cache(monkeypatch, gzip_response, tmpdir): openml_path = sklearn.datasets.openml._DATA_FILE.format(data_id) cache_directory = str(tmpdir.mkdir('scikit_learn_data')) # first fill the cache - response1 = _open_openml_url(openml_path, cache_directory) + response1 = _openml_url_bytes(openml_path, cache_directory) # assert file exists location = _get_local_path(openml_path, cache_directory) assert os.path.isfile(location) # redownload, to utilize cache - response2 = _open_openml_url(openml_path, cache_directory) - assert response1.read() == response2.read() + response2 = _openml_url_bytes(openml_path, cache_directory) + assert response1 == response2 @pytest.mark.parametrize('gzip_response', [True, False]) @@ -949,7 +949,7 @@ def _mock_urlopen(request): monkeypatch.setattr(sklearn.datasets.openml, 'urlopen', _mock_urlopen) with pytest.raises(ValueError, match="Invalid request"): - _open_openml_url(openml_path, cache_directory) + _openml_url_bytes(openml_path, cache_directory) assert not os.path.exists(location) From 10ecf9a89528c39fe47a9a62d59d4fc4f99d34bc Mon Sep 17 00:00:00 2001 From: Shashank Singh Date: Sun, 29 Sep 2019 21:54:50 -0400 Subject: [PATCH 06/25] read and update md5 in chunks --- sklearn/datasets/openml.py | 62 +++++++++++++++++++++++++------------- 1 file changed, 41 insertions(+), 21 deletions(-) diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py index 721d879d1e1f0..612d769db94dc 100644 --- a/sklearn/datasets/openml.py +++ b/sklearn/datasets/openml.py @@ -86,38 +86,58 @@ def is_gzip(_fsrc): req = Request(_OPENML_PREFIX + openml_path) req.add_header('Accept-encoding', 'gzip') - def _md5_validated_bytes(bytes_content, md5_checksum): + def _md5_validated_bytestream(fsrc, expected_md5=None, chunk_size=512): """ - Consume binary stream to validate checksum, - return a new stream with same content + Takes in a byte-stream, reads in chunks and returns bytes. + If expected_md5 is not none, keeps md5 checksum state while streaming + and validates post stream consumption. 
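In other words, the rewritten helper keeps a running hashlib.md5 and feeds it chunk by chunk instead of hashing one large buffer in a single call. A small self-contained sketch of that pattern (chunk size and sample data are arbitrary):

    import hashlib
    import io

    def checksummed_read(fsrc, expected_md5, chunk_size=512):
        # Accumulate the payload while feeding every chunk to the digest.
        content = bytearray()
        file_md5 = hashlib.md5()
        while True:
            chunk = fsrc.read(chunk_size)
            if not chunk:
                break
            file_md5.update(chunk)
            content.extend(chunk)
        if file_md5.hexdigest() != expected_md5:
            raise ValueError("md5checksum: {} does not match expected: "
                             "{}".format(file_md5.hexdigest(), expected_md5))
        return bytes(content)

    data = b"x" * 2048
    assert checksummed_read(io.BytesIO(data),
                            hashlib.md5(data).hexdigest()) == data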
Parameters ---------- - bytes_content : bytes + fsrc : io.BufferedIOBase + input stream to read bytes from - md5_checksum: str - Expected md5 checksum of bytes + expected_md5 : str + expected md5-checksum value + + chunk_size : int + size of chunks to read at a time from stream Returns ------- - bytes + fsrc_bytes : bytes + equivalent to fsrc_bytes.read() but with md5 validation if + expected_md5 is provided + + Raises + ------ + ValueError : + if expected_md5 does not match actual md5-checksum of stream """ - actual_md5_checksum = hashlib.md5(bytes_content).hexdigest() - if md5_checksum != actual_md5_checksum: + fsrc_bytes = bytes() + file_md5 = hashlib.md5() if expected_md5_checksum else None + while True: + data = fsrc.read(chunk_size) + if not data: + break + if expected_md5: + file_md5.update(data) + fsrc_bytes += data + + if expected_md5 and file_md5.hexdigest() != expected_md5: raise ValueError("md5checksum: {} does not match expected: " - "{}".format(actual_md5_checksum, - md5_checksum)) - return bytes_content + "{}".format(file_md5.hexdigest(), + expected_md5)) + return fsrc_bytes if data_home is None: fsrc = urlopen(req) if is_gzip(fsrc): fsrc = gzip.GzipFile(fileobj=fsrc, mode='rb') - bytes_content = fsrc.read() - if expected_md5_checksum: - # validating checksum reads and consumes the stream - return _md5_validated_bytes(bytes_content, expected_md5_checksum) - return bytes_content + return _md5_validated_bytestream( + fsrc, + expected_md5=expected_md5_checksum + ) local_path = _get_local_path(openml_path, data_home) if not os.path.exists(local_path): @@ -131,10 +151,10 @@ def _md5_validated_bytes(bytes_content, md5_checksum): with closing(urlopen(req)) as fsrc: if is_gzip(fsrc): # unzip it for checksum validation fsrc = gzip.GzipFile(fileobj=fsrc, mode='rb') - bytes_content = fsrc.read() - if expected_md5_checksum: - bytes_content = _md5_validated_bytes(bytes_content, - expected_md5_checksum) + bytes_content = _md5_validated_bytestream( + fsrc, + expected_md5=expected_md5_checksum + ) with gzip.GzipFile(local_path, 'wb') as fdst: fdst.write(bytes_content) except Exception: From f8c8fe4c1aa88d03cbe03e7afad0c5708938e0e0 Mon Sep 17 00:00:00 2001 From: Shashank Singh Date: Thu, 10 Oct 2019 20:19:06 -0400 Subject: [PATCH 07/25] bytearray extend while chunked construction --- sklearn/datasets/openml.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py index 612d769db94dc..2cf5b85fdd6b1 100644 --- a/sklearn/datasets/openml.py +++ b/sklearn/datasets/openml.py @@ -114,7 +114,7 @@ def _md5_validated_bytestream(fsrc, expected_md5=None, chunk_size=512): ValueError : if expected_md5 does not match actual md5-checksum of stream """ - fsrc_bytes = bytes() + fsrc_bytes = bytearray() file_md5 = hashlib.md5() if expected_md5_checksum else None while True: data = fsrc.read(chunk_size) @@ -122,13 +122,13 @@ def _md5_validated_bytestream(fsrc, expected_md5=None, chunk_size=512): break if expected_md5: file_md5.update(data) - fsrc_bytes += data + fsrc_bytes.extend(data) if expected_md5 and file_md5.hexdigest() != expected_md5: raise ValueError("md5checksum: {} does not match expected: " "{}".format(file_md5.hexdigest(), expected_md5)) - return fsrc_bytes + return bytes(fsrc_bytes) if data_home is None: fsrc = urlopen(req) From 3c5ab3e1a69cba565f8d70ade947771b1f70169f Mon Sep 17 00:00:00 2001 From: Shashank Singh Date: Fri, 11 Oct 2019 19:10:58 -0400 Subject: [PATCH 08/25] Update sklearn/datasets/openml.py return bytearray 
instead of new bytes Co-Authored-By: Thomas J Fan --- sklearn/datasets/openml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py index 2cf5b85fdd6b1..1820c759657a5 100644 --- a/sklearn/datasets/openml.py +++ b/sklearn/datasets/openml.py @@ -128,7 +128,7 @@ def _md5_validated_bytestream(fsrc, expected_md5=None, chunk_size=512): raise ValueError("md5checksum: {} does not match expected: " "{}".format(file_md5.hexdigest(), expected_md5)) - return bytes(fsrc_bytes) + return fsrc_bytes if data_home is None: fsrc = urlopen(req) From db159a5f6f587a924d844b5b0492ad64cb887f0f Mon Sep 17 00:00:00 2001 From: Shashank Singh Date: Mon, 2 Dec 2019 19:06:02 -0500 Subject: [PATCH 09/25] add early exiting --- sklearn/datasets/_openml.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/sklearn/datasets/_openml.py b/sklearn/datasets/_openml.py index c3e10f2c828cf..47ca1b604be88 100644 --- a/sklearn/datasets/_openml.py +++ b/sklearn/datasets/_openml.py @@ -114,17 +114,19 @@ def _md5_validated_bytestream(fsrc, expected_md5=None, chunk_size=512): ValueError : if expected_md5 does not match actual md5-checksum of stream """ + if expected_md5 is None: + return fsrc.read() + fsrc_bytes = bytearray() - file_md5 = hashlib.md5() if expected_md5_checksum else None + file_md5 = hashlib.md5() while True: data = fsrc.read(chunk_size) if not data: break - if expected_md5: - file_md5.update(data) + file_md5.update(data) fsrc_bytes.extend(data) - if expected_md5 and file_md5.hexdigest() != expected_md5: + if file_md5.hexdigest() != expected_md5: raise ValueError("md5checksum: {} does not match expected: " "{}".format(file_md5.hexdigest(), expected_md5)) From f745e2113d2eb0339b400056348d35c9fe3d9bb9 Mon Sep 17 00:00:00 2001 From: Shashank Singh Date: Fri, 3 Jan 2020 20:36:26 -0500 Subject: [PATCH 10/25] revert back to simple case --- sklearn/datasets/_openml.py | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/sklearn/datasets/_openml.py b/sklearn/datasets/_openml.py index 47ca1b604be88..88ebd504b2d64 100644 --- a/sklearn/datasets/_openml.py +++ b/sklearn/datasets/_openml.py @@ -114,23 +114,16 @@ def _md5_validated_bytestream(fsrc, expected_md5=None, chunk_size=512): ValueError : if expected_md5 does not match actual md5-checksum of stream """ + bytes = fsrc.read() + if expected_md5 is None: - return fsrc.read() - - fsrc_bytes = bytearray() - file_md5 = hashlib.md5() - while True: - data = fsrc.read(chunk_size) - if not data: - break - file_md5.update(data) - fsrc_bytes.extend(data) - - if file_md5.hexdigest() != expected_md5: + return bytes + + file_md5 = hashlib.md5(bytes).hexdigest() + if file_md5 != expected_md5: raise ValueError("md5checksum: {} does not match expected: " - "{}".format(file_md5.hexdigest(), - expected_md5)) - return fsrc_bytes + "{}".format(file_md5, expected_md5)) + return bytes if data_home is None: fsrc = urlopen(req) From 343303178773d98430d44ac0306f001714f15b0e Mon Sep 17 00:00:00 2001 From: Shashank Singh Date: Sat, 2 May 2020 23:43:39 -0400 Subject: [PATCH 11/25] Merge upstream, verify checksum while yielding stream --- sklearn/datasets/_openml.py | 343 +++++++++++++------------- sklearn/datasets/tests/test_openml.py | 111 ++++----- 2 files changed, 220 insertions(+), 234 deletions(-) diff --git a/sklearn/datasets/_openml.py b/sklearn/datasets/_openml.py index 88ebd504b2d64..c5a8da376403d 100644 --- a/sklearn/datasets/_openml.py +++ 
b/sklearn/datasets/_openml.py @@ -1,6 +1,7 @@ import gzip import json import os +import shutil import hashlib from os.path import join from warnings import warn @@ -9,6 +10,7 @@ import itertools from collections.abc import Generator from collections import OrderedDict +from functools import partial from urllib.request import urlopen, Request @@ -22,6 +24,7 @@ from ..utils import get_chunk_n_rows from ..utils import _chunk_generator from ..utils import check_pandas_support # noqa +from ..utils.validation import _deprecate_positional_args __all__ = ['fetch_openml'] @@ -44,11 +47,11 @@ def _retry_with_clean_cache(openml_path, data_home): """ def decorator(f): @wraps(f) - def wrapper(): + def wrapper(*args, **kw): if data_home is None: - return f() + return f(*args, **kw) try: - return f() + return f(*args, **kw) except HTTPError: raise except Exception: @@ -56,12 +59,12 @@ def wrapper(): local_path = _get_local_path(openml_path, data_home) if os.path.exists(local_path): os.unlink(local_path) - return f() + return f(*args, **kw) return wrapper return decorator -def _open_openml_url(openml_path, data_home, expected_md5_checksum=None): +def _open_openml_url(openml_path, data_home): """ Returns a resource from OpenML.org. Caches it to data_home if required. @@ -77,8 +80,8 @@ def _open_openml_url(openml_path, data_home, expected_md5_checksum=None): Returns ------- - result : bytes - Byte content of resource + result : stream + A stream to the OpenML resource """ def is_gzip(_fsrc): return _fsrc.info().get('Content-Encoding', '') == 'gzip' @@ -86,53 +89,11 @@ def is_gzip(_fsrc): req = Request(_OPENML_PREFIX + openml_path) req.add_header('Accept-encoding', 'gzip') - def _md5_validated_bytestream(fsrc, expected_md5=None, chunk_size=512): - """ - Takes in a byte-stream, reads in chunks and returns bytes. - If expected_md5 is not none, keeps md5 checksum state while streaming - and validates post stream consumption. 
- - Parameters - ---------- - fsrc : io.BufferedIOBase - input stream to read bytes from - - expected_md5 : str - expected md5-checksum value - - chunk_size : int - size of chunks to read at a time from stream - - Returns - ------- - fsrc_bytes : bytes - equivalent to fsrc_bytes.read() but with md5 validation if - expected_md5 is provided - - Raises - ------ - ValueError : - if expected_md5 does not match actual md5-checksum of stream - """ - bytes = fsrc.read() - - if expected_md5 is None: - return bytes - - file_md5 = hashlib.md5(bytes).hexdigest() - if file_md5 != expected_md5: - raise ValueError("md5checksum: {} does not match expected: " - "{}".format(file_md5, expected_md5)) - return bytes - if data_home is None: fsrc = urlopen(req) if is_gzip(fsrc): - fsrc = gzip.GzipFile(fileobj=fsrc, mode='rb') - return _md5_validated_bytestream( - fsrc, - expected_md5=expected_md5_checksum - ) + return gzip.GzipFile(fileobj=fsrc, mode='rb') + return fsrc local_path = _get_local_path(openml_path, data_home) if not os.path.exists(local_path): @@ -144,25 +105,20 @@ def _md5_validated_bytestream(fsrc, expected_md5=None, chunk_size=512): try: with closing(urlopen(req)) as fsrc: - if is_gzip(fsrc): # unzip it for checksum validation - fsrc = gzip.GzipFile(fileobj=fsrc, mode='rb') - bytes_content = _md5_validated_bytestream( - fsrc, - expected_md5=expected_md5_checksum - ) - with gzip.GzipFile(local_path, 'wb') as fdst: - fdst.write(bytes_content) + if is_gzip(fsrc): + with open(local_path, 'wb') as fdst: + shutil.copyfileobj(fsrc, fdst) + else: + with gzip.GzipFile(local_path, 'wb') as fdst: + shutil.copyfileobj(fsrc, fdst) except Exception: if os.path.exists(local_path): os.unlink(local_path) raise - else: - with gzip.GzipFile(local_path, "rb") as gzip_file: - bytes_content = gzip_file.read() # XXX: First time, decompression will not be necessary (by using fsrc), but # it will happen nonetheless - return bytes_content + return gzip.GzipFile(local_path, 'rb') def _get_json_content_from_openml_api(url, error_message, raise_if_error, @@ -199,7 +155,8 @@ def _get_json_content_from_openml_api(url, error_message, raise_if_error, @_retry_with_clean_cache(url, data_home) def _load_json(): - return json.loads(_open_openml_url(url, data_home).decode("utf-8")) + with closing(_open_openml_url(url, data_home)) as response: + return json.loads(response.read().decode("utf-8")) try: return _load_json() @@ -263,7 +220,7 @@ def _sparse_data_to_array(arff_data, include_columns): return y -def _convert_arff_data(arff_data, col_slice_x, col_slice_y, shape=None): +def _convert_arff_data(arff, col_slice_x, col_slice_y, shape=None): """ converts the arff object into the appropriate matrix type (np.array or scipy.sparse.csr_matrix) based on the 'data part' (i.e., in the @@ -271,8 +228,8 @@ def _convert_arff_data(arff_data, col_slice_x, col_slice_y, shape=None): Parameters ---------- - arff_data : list or dict - as obtained from liac-arff object + arff : dict + As obtained from liac-arff object. 
col_slice_x : list The column indices that are sliced from the original array to return @@ -287,6 +244,7 @@ def _convert_arff_data(arff_data, col_slice_x, col_slice_y, shape=None): X : np.array or scipy.sparse.csr_matrix y : np.array """ + arff_data = arff['data'] if isinstance(arff_data, Generator): if shape[0] == -1: count = -1 @@ -346,7 +304,8 @@ def _convert_arff_data_dataframe(arff, columns, features_dict): Returns ------- - dataframe : pandas DataFrame + result : tuple + tuple with the resulting dataframe """ pd = check_pandas_support('fetch_openml with as_frame=True') @@ -373,7 +332,7 @@ def _convert_arff_data_dataframe(arff, columns, features_dict): if dtype == 'category': dtype = pd.api.types.CategoricalDtype(attributes[column]) df[column] = df[column].astype(dtype, copy=False) - return df + return (df, ) def _get_data_info_by_name(name, version, data_home): @@ -494,29 +453,131 @@ def _get_num_samples(data_qualities): return int(float(qualities.get('NumberOfInstances', default_n_samples))) -def _download_data_arff(file_id, sparse, data_home, encode_nominal=True, - expected_md5_checksum=None): - # Accesses an ARFF file on the OpenML server. Documentation: - # https://www.openml.org/api_data_docs#!/data/get_download_id - # encode_nominal argument is to ensure unit testing, do not alter in - # production! - url = _DATA_FILE.format(file_id) +def _load_arff_response(url, data_home, return_type, encode_nominal, + parse_arff, md5_checksum): + """Load arff data with url and parses arff response with parse_arff""" + response = _open_openml_url(url, data_home) + + with closing(response): + # Note that if the data is dense, no reading is done until the data + # generator is iterated. + actual_md5_checksum = hashlib.md5() + + def _stream_checksum_generator(response): + for line in response: + actual_md5_checksum.update(line) + yield line.decode('utf-8') + # stream consumed, check md5 + if actual_md5_checksum.hexdigest() != md5_checksum: + raise ValueError("md5 checksum of local file for " + url + + " does not match description. " + "Downloaded file could have been modified / corrupted," + "clean cache and retry...") + + arff = _arff.load(_stream_checksum_generator(response), + return_type=return_type, + encode_nominal=encode_nominal) + return parse_arff(arff) + + +def _download_data_to_bunch(url, sparse, data_home, *, as_frame, features_list, data_columns, target_columns, shape, + md5_checksum): + """Download OpenML ARFF and convert to Bunch of data""" + # NB: this function is long in order to handle retry for any failure + # during the streaming parse of the ARFF. + + # Prepare which columns and data types should be returned for the X and y + features_dict = {feature['name']: feature for feature in features_list} - @_retry_with_clean_cache(url, data_home) - def _arff_load(): - bytes_content = _open_openml_url(url, data_home, - expected_md5_checksum) - if sparse is True: - return_type = _arff.COO - else: - return_type = _arff.DENSE_GEN + # XXX: col_slice_y should be all nominal or all numeric + _verify_target_data_type(features_dict, target_columns) + + col_slice_y = [int(features_dict[col_name]['index']) + for col_name in target_columns] + + col_slice_x = [int(features_dict[col_name]['index']) + for col_name in data_columns] + for col_idx in col_slice_y: + feat = features_list[col_idx] + nr_missing = int(feat['number_of_missing_values']) + if nr_missing > 0: + raise ValueError('Target column {} has {} missing values. ' + 'Missing values are not supported for target ' + 'columns. 
'.format(feat['name'], nr_missing)) - arff_file = _arff.loads(bytes_content.decode('utf-8'), - encode_nominal=encode_nominal, - return_type=return_type) - return arff_file + # Access an ARFF file on the OpenML server. Documentation: + # https://www.openml.org/api_data_docs#!/data/get_download_id - return _arff_load() + if sparse is True: + return_type = _arff.COO + else: + return_type = _arff.DENSE_GEN + + frame = nominal_attributes = None + if as_frame: + columns = data_columns + target_columns + parse_arff = partial(_convert_arff_data_dataframe, columns=columns, + features_dict=features_dict) + + def postprocess(frame): # type:ignore + X = frame[data_columns] + if len(target_columns) >= 2: + y = frame[target_columns] + elif len(target_columns) == 1: + y = frame[target_columns[0]] + else: + y = None + return X, y, frame, nominal_attributes + else: + def parse_arff(arff): + X, y = _convert_arff_data(arff, col_slice_x, col_slice_y, shape) + # nominal attributes is a dict mapping from the attribute name to + # the possible values. Includes also the target column (which will + # be popped off below, before it will be packed in the Bunch + # object) + nominal_attributes = {k: v for k, v in arff['attributes'] + if isinstance(v, list) and + k in data_columns + target_columns} + return X, y, nominal_attributes + + def postprocess(X, y, nominal_attributes): # type:ignore + is_classification = {col_name in nominal_attributes + for col_name in target_columns} + if not is_classification: + # No target + pass + elif all(is_classification): + y = np.hstack([ + np.take( + np.asarray(nominal_attributes.pop(col_name), + dtype='O'), + y[:, i:i + 1].astype(int, copy=False)) + for i, col_name in enumerate(target_columns) + ]) + elif any(is_classification): + raise ValueError('Mix of nominal and non-nominal targets is ' + 'not currently supported') + + # reshape y back to 1-D array, if there is only 1 target column; + # back to None if there are not target columns + if y.shape[1] == 1: + y = y.reshape((-1,)) + elif y.shape[1] == 0: + y = None + return X, y, frame, nominal_attributes + + out = _retry_with_clean_cache(url, data_home)( + _load_arff_response)(url, data_home, + return_type=return_type, + encode_nominal=not as_frame, + parse_arff=parse_arff, + md5_checksum=md5_checksum) + X, y, frame, nominal_attributes = postprocess(*out) + + return Bunch(data=X, target=y, frame=frame, + categories=nominal_attributes, + feature_names=data_columns, + target_names=target_columns) def _verify_target_data_type(features_dict, target_columns): @@ -561,9 +622,10 @@ def _valid_data_column_names(features_list, target_columns): return valid_data_column_names -def fetch_openml(name=None, version='active', data_id=None, data_home=None, +@_deprecate_positional_args +def fetch_openml(name=None, *, version='active', data_id=None, data_home=None, target_column='default-target', cache=True, return_X_y=False, - as_frame=False, verify_checksum=True): + as_frame=False): """Fetch dataset from openml by name or dataset id. Datasets are uniquely identified by either an integer ID or by a @@ -574,6 +636,8 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None, Read more in the :ref:`User Guide `. + .. versionadded:: 0.20 + .. note:: EXPERIMENTAL The API is experimental (particularly the return value structure), @@ -624,16 +688,11 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None, data. 
If ``return_X_y`` is True, then ``(data, target)`` will be pandas DataFrames or Series as describe above. - verify_checksum : boolean, default=True - If True, verifies md5_checksum of file provided in /download/{id} - If cache=True, verification only happens during data download - from network. - Returns ------- - data : Bunch - Dictionary-like object, with attributes: + data : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. data : np.array, scipy.sparse.csr_matrix of floats, or pandas DataFrame The feature matrix. Categorical features are encoded as ordinals. @@ -756,25 +815,6 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None, data_columns = _valid_data_column_names(features_list, target_columns) - # prepare which columns and data types should be returned for the X and y - features_dict = {feature['name']: feature for feature in features_list} - - # XXX: col_slice_y should be all nominal or all numeric - _verify_target_data_type(features_dict, target_columns) - - col_slice_y = [int(features_dict[col_name]['index']) - for col_name in target_columns] - - col_slice_x = [int(features_dict[col_name]['index']) - for col_name in data_columns] - for col_idx in col_slice_y: - feat = features_list[col_idx] - nr_missing = int(feat['number_of_missing_values']) - if nr_missing > 0: - raise ValueError('Target column {} has {} missing values. ' - 'Missing values are not supported for target ' - 'columns. '.format(feat['name'], nr_missing)) - # determine arff encoding to return if not return_sparse: # The shape must include the ignored features to keep the right indexes @@ -785,69 +825,22 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None, shape = None # obtain the data - arff = _download_data_arff(data_description['file_id'], - return_sparse, - data_home, - encode_nominal=not as_frame, - expected_md5_checksum=data_description["md5_checksum"] if verify_checksum else None) # noqa: E501 + url = _DATA_FILE.format(data_description['file_id']) + bunch = _download_data_to_bunch(url, return_sparse, data_home, + as_frame=as_frame, + features_list=features_list, shape=shape, + target_columns=target_columns, + data_columns=data_columns, + md5_checksum=data_description["md5_checksum"]) + + if return_X_y: + return bunch.data, bunch.target description = "{}\n\nDownloaded from openml.org.".format( data_description.pop('description')) - nominal_attributes = None - frame = None - if as_frame: - columns = data_columns + target_columns - frame = _convert_arff_data_dataframe(arff, columns, features_dict) - X = frame[data_columns] - if len(target_columns) >= 2: - y = frame[target_columns] - elif len(target_columns) == 1: - y = frame[target_columns[0]] - else: - y = None - else: - # nominal attributes is a dict mapping from the attribute name to the - # possible values. 
Includes also the target column (which will be - # popped off below, before it will be packed in the Bunch object) - nominal_attributes = {k: v for k, v in arff['attributes'] - if isinstance(v, list) and - k in data_columns + target_columns} - - X, y = _convert_arff_data(arff['data'], col_slice_x, - col_slice_y, shape) - - is_classification = {col_name in nominal_attributes - for col_name in target_columns} - if not is_classification: - # No target - pass - elif all(is_classification): - y = np.hstack([ - np.take( - np.asarray(nominal_attributes.pop(col_name), dtype='O'), - y[:, i:i + 1].astype(int, copy=False)) - for i, col_name in enumerate(target_columns) - ]) - elif any(is_classification): - raise ValueError('Mix of nominal and non-nominal targets is not ' - 'currently supported') - - # reshape y back to 1-D array, if there is only 1 target column; back - # to None if there are not target columns - if y.shape[1] == 1: - y = y.reshape((-1,)) - elif y.shape[1] == 0: - y = None - - if return_X_y: - return X, y - - bunch = Bunch( - data=X, target=y, frame=frame, feature_names=data_columns, - target_names=target_columns, + bunch.update( DESCR=description, details=data_description, - categories=nominal_attributes, url="https://www.openml.org/d/{}".format(data_id)) return bunch diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py index c713e4087ed42..8187dc3feb9db 100644 --- a/sklearn/datasets/tests/test_openml.py +++ b/sklearn/datasets/tests/test_openml.py @@ -13,8 +13,9 @@ from sklearn import config_context from sklearn.datasets import fetch_openml from sklearn.datasets._openml import (_open_openml_url, + _arff, + _DATA_FILE, _get_data_description_by_id, - _download_data_arff, _get_local_path, _retry_with_clean_cache, _feature_to_dtype) @@ -49,8 +50,7 @@ def decode_column(data_bunch, col_idx): # non-nominal attribute return data_bunch.data[:, col_idx] - data_bunch = fetch_openml(data_id=data_id, cache=False, target_column=None, - verify_checksum=False) + data_bunch = fetch_openml(data_id=data_id, cache=False, target_column=None) # also obtain decoded arff data_description = _get_data_description_by_id(data_id, None) @@ -58,8 +58,13 @@ def decode_column(data_bunch, col_idx): if sparse is True: raise ValueError('This test is not intended for sparse data, to keep ' 'code relatively simple') - data_arff = _download_data_arff(data_description['file_id'], - sparse, None, False) + url = _DATA_FILE.format(data_description['file_id']) + with _open_openml_url(url, data_home=None) as f: + data_arff = _arff.load((line.decode('utf-8') for line in f), + return_type=(_arff.COO if sparse + else _arff.DENSE_GEN), + encode_nominal=False) + data_downloaded = np.array(list(data_arff['data']), dtype='O') for i in range(len(data_bunch.feature_names)): @@ -81,19 +86,18 @@ def _fetch_dataset_from_openml(data_id, data_name, data_version, # result. 
Note that this function can be mocked (by invoking # _monkey_patch_webbased_functions before invoking this function) data_by_name_id = fetch_openml(name=data_name, version=data_version, - cache=False, verify_checksum=False) + cache=False) assert int(data_by_name_id.details['id']) == data_id # Please note that cache=False is crucial, as the monkey patched files are # not consistent with reality - fetch_openml(name=data_name, cache=False, verify_checksum=False) + fetch_openml(name=data_name, cache=False) # without specifying the version, there is no guarantee that the data id # will be the same # fetch with dataset id data_by_id = fetch_openml(data_id=data_id, cache=False, - target_column=target_column, - verify_checksum=False) + target_column=target_column) assert data_by_id.details['name'] == data_name assert data_by_id.data.shape == (expected_observations, expected_features) if isinstance(target_column, str): @@ -105,7 +109,7 @@ def _fetch_dataset_from_openml(data_id, data_name, data_version, assert data_by_id.target.shape == (expected_observations, len(target_column)) assert data_by_id.target_names == target_column - assert data_by_id.data.dtype == np.float64 + assert data_by_id.data.dtype == expected_data_dtype assert data_by_id.target.dtype == expected_target_dtype assert len(data_by_id.feature_names) == expected_features for feature in data_by_id.feature_names: @@ -120,13 +124,8 @@ def _fetch_dataset_from_openml(data_id, data_name, data_version, if compare_default_target: # check whether the data by id and data by id target are equal - data_by_id_default = fetch_openml(data_id=data_id, cache=False, - verify_checksum=False) - if data_by_id.data.dtype == np.float64: - np.testing.assert_allclose(data_by_id.data, - data_by_id_default.data) - else: - assert np.array_equal(data_by_id.data, data_by_id_default.data) + data_by_id_default = fetch_openml(data_id=data_id, cache=False) + np.testing.assert_allclose(data_by_id.data, data_by_id_default.data) if data_by_id.target.dtype == np.float64: np.testing.assert_allclose(data_by_id.target, data_by_id_default.target) @@ -143,7 +142,7 @@ def _fetch_dataset_from_openml(data_id, data_name, data_version, # test return_X_y option fetch_func = partial(fetch_openml, data_id=data_id, cache=False, - target_column=target_column, verify_checksum=False) + target_column=target_column) check_return_X_y(data_by_id, fetch_func) return data_by_id @@ -184,6 +183,15 @@ def info(self): return {'Content-Encoding': 'gzip'} return {} + def __iter__(self): + return iter(self.data) + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + return False + def _file_name(url, suffix): return (re.sub(r'\W', '-', url[len("https://openml.org/"):]) + suffix + path_suffix) @@ -407,8 +415,7 @@ def test_fetch_openml_anneal_pandas(monkeypatch): _monkey_patch_webbased_functions(monkeypatch, data_id, True) bunch = fetch_openml(data_id=data_id, as_frame=True, - target_column=target_column, cache=False, - verify_checksum=False) + target_column=target_column, cache=False) data = bunch.data target = bunch.target frame = bunch.frame @@ -492,8 +499,7 @@ def test_convert_arff_data_dataframe_warning_low_memory_pandas(monkeypatch): msg = 'Could not adhere to working_memory config.' 
with pytest.warns(UserWarning, match=msg): with config_context(working_memory=1e-6): - fetch_openml(data_id=data_id, as_frame=True, cache=False, - verify_checksum=False) + fetch_openml(data_id=data_id, as_frame=True, cache=False) def test_fetch_openml_adultcensus_pandas_return_X_y(monkeypatch): @@ -510,7 +516,7 @@ def test_fetch_openml_adultcensus_pandas_return_X_y(monkeypatch): _monkey_patch_webbased_functions(monkeypatch, data_id, True) X, y = fetch_openml(data_id=data_id, as_frame=True, cache=False, - return_X_y=True, verify_checksum=False) + return_X_y=True) assert isinstance(X, pd.DataFrame) assert X.shape == data_shape n_categories = len([dtype for dtype in X.dtypes @@ -539,8 +545,7 @@ def test_fetch_openml_adultcensus_pandas(monkeypatch): target_column = 'class' _monkey_patch_webbased_functions(monkeypatch, data_id, True) - bunch = fetch_openml(data_id=data_id, as_frame=True, cache=False, - verify_checksum=False) + bunch = fetch_openml(data_id=data_id, as_frame=True, cache=False) data = bunch.data target = bunch.target frame = bunch.frame @@ -578,8 +583,7 @@ def test_fetch_openml_miceprotein_pandas(monkeypatch): frame_n_floats = 77 _monkey_patch_webbased_functions(monkeypatch, data_id, True) - bunch = fetch_openml(data_id=data_id, as_frame=True, cache=False, - verify_checksum=False) + bunch = fetch_openml(data_id=data_id, as_frame=True, cache=False) data = bunch.data target = bunch.target frame = bunch.frame @@ -619,7 +623,7 @@ def test_fetch_openml_emotions_pandas(monkeypatch): _monkey_patch_webbased_functions(monkeypatch, data_id, True) bunch = fetch_openml(data_id=data_id, as_frame=True, cache=False, - target_column=target_column, verify_checksum=False) + target_column=target_column) data = bunch.data target = bunch.target frame = bunch.frame @@ -748,7 +752,7 @@ def test_fetch_openml_iris_multitarget(monkeypatch, gzip_response): _fetch_dataset_from_openml(data_id, data_name, data_version, target_column, expected_observations, expected_features, expected_missing, - object, np.float64, expect_sparse=False, + np.float64, np.float64, expect_sparse=False, compare_default_target=False) @@ -767,7 +771,7 @@ def test_fetch_openml_anneal(monkeypatch, gzip_response): _fetch_dataset_from_openml(data_id, data_name, data_version, target_column, expected_observations, expected_features, expected_missing, - object, object, expect_sparse=False, + np.float64, object, expect_sparse=False, compare_default_target=True) @@ -792,7 +796,7 @@ def test_fetch_openml_anneal_multitarget(monkeypatch, gzip_response): _fetch_dataset_from_openml(data_id, data_name, data_version, target_column, expected_observations, expected_features, expected_missing, - object, object, expect_sparse=False, + np.float64, object, expect_sparse=False, compare_default_target=False) @@ -810,7 +814,7 @@ def test_fetch_openml_cpu(monkeypatch, gzip_response): _fetch_dataset_from_openml(data_id, data_name, data_version, target_column, expected_observations, expected_features, expected_missing, - object, np.float64, expect_sparse=False, + np.float64, np.float64, expect_sparse=False, compare_default_target=True) @@ -934,7 +938,7 @@ def test_open_openml_url_cache(monkeypatch, gzip_response, tmpdir): assert os.path.isfile(location) # redownload, to utilize cache response2 = _open_openml_url(openml_path, cache_directory) - assert response1 == response2 + assert response1.read() == response2.read() @pytest.mark.parametrize('gzip_response', [True, False]) @@ -1011,8 +1015,7 @@ def _mock_urlopen_raise(request): monkeypatch, data_id, 
gzip_response) X_fetched, y_fetched = fetch_openml(data_id=data_id, cache=True, data_home=cache_directory, - return_X_y=True, - verify_checksum=False) + return_X_y=True) monkeypatch.setattr(sklearn.datasets._openml, 'urlopen', _mock_urlopen_raise) @@ -1045,13 +1048,12 @@ def test_fetch_openml_inactive(monkeypatch, gzip_response): _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response) glas2 = assert_warns_message( UserWarning, "Version 1 of dataset glass2 is inactive,", fetch_openml, - data_id=data_id, cache=False, verify_checksum=False) + data_id=data_id, cache=False) # fetch inactive dataset by name and version assert glas2.data.shape == (163, 9) glas2_by_version = assert_warns_message( UserWarning, "Version 1 of dataset glass2 is inactive,", fetch_openml, - data_id=None, name="glass2", version=1, cache=False, - verify_checksum=False) + data_id=None, name="glass2", version=1, cache=False) assert int(glas2_by_version.details['id']) == data_id @@ -1062,8 +1064,7 @@ def test_fetch_nonexiting(monkeypatch, gzip_response): _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response) # Note that we only want to search by name (not data id) assert_raise_message(ValueError, "No active dataset glass2 found", - fetch_openml, name='glass2', cache=False, - verify_checksum=False) + fetch_openml, name='glass2', cache=False) @pytest.mark.parametrize('gzip_response', [True, False]) @@ -1075,8 +1076,7 @@ def test_raises_illegal_multitarget(monkeypatch, gzip_response): assert_raise_message(ValueError, "Can only handle homogeneous multi-target datasets,", fetch_openml, data_id=data_id, - target_column=targets, cache=False, - verify_checksum=False) + target_column=targets, cache=False) @pytest.mark.parametrize('gzip_response', [True, False]) @@ -1089,20 +1089,20 @@ def test_warn_ignore_attribute(monkeypatch, gzip_response): assert_warns_message(UserWarning, expected_row_id_msg.format('MouseID'), fetch_openml, data_id=data_id, target_column='MouseID', - cache=False, verify_checksum=False) + cache=False) assert_warns_message(UserWarning, expected_ignore_msg.format('Genotype'), fetch_openml, data_id=data_id, target_column='Genotype', - cache=False, verify_checksum=False) + cache=False) # multi column test assert_warns_message(UserWarning, expected_row_id_msg.format('MouseID'), fetch_openml, data_id=data_id, target_column=['MouseID', 'class'], - cache=False, verify_checksum=False) + cache=False) assert_warns_message(UserWarning, expected_ignore_msg.format('Genotype'), fetch_openml, data_id=data_id, target_column=['Genotype', 'class'], - cache=False, verify_checksum=False) + cache=False) @pytest.mark.parametrize('gzip_response', [True, False]) @@ -1113,8 +1113,7 @@ def test_string_attribute_without_dataframe(monkeypatch, gzip_response): assert_raise_message(ValueError, ('STRING attributes are not supported for ' 'array representation. Try as_frame=True'), - fetch_openml, data_id=data_id, cache=False, - verify_checksum=False) + fetch_openml, data_id=data_id, cache=False) @pytest.mark.parametrize('gzip_response', [True, False]) @@ -1125,7 +1124,7 @@ def test_dataset_with_openml_error(monkeypatch, gzip_response): UserWarning, "OpenML registered a problem with the dataset. It might be unusable. 
" "Error:", - fetch_openml, data_id=data_id, cache=False, verify_checksum=False + fetch_openml, data_id=data_id, cache=False ) @@ -1212,18 +1211,12 @@ def test_fetch_openml_verify_checksum(monkeypatch): data[len(data)-1] = 37 modified_gzip.write(data) - # succeeds if checksum validation is explicitly overriden - dataset = sklearn.datasets.fetch_openml(data_id=data_id, cache=False, - verify_checksum=False) - assert dataset - - # fails with checksum validation (default) + # should fail checksum validation with pytest.raises(ValueError) as exc: sklearn.datasets.fetch_openml(data_id=data_id, cache=False, - as_frame=True, - verify_checksum=True) - - assert exc + as_frame=True) + # exception message should have file-path + assert "1666876" in repr(exc) finally: shutil.copy(backup_data_path, original_data_path) os.remove(backup_data_path) From c7774ef76df968d2deb226aed2a7a66dd41d9287 Mon Sep 17 00:00:00 2001 From: Shashank Singh Date: Sun, 3 May 2020 01:16:36 -0400 Subject: [PATCH 12/25] Update checksum for truncated local test files --- .../data/openml/1/api-v1-json-data-1.json.gz | Bin 1785 -> 1786 bytes .../openml/292/api-v1-json-data-292.json.gz | Bin 547 -> 551 bytes .../openml/292/api-v1-json-data-40981.json.gz | Bin 547 -> 553 bytes 3 files changed, 0 insertions(+), 0 deletions(-) diff --git a/sklearn/datasets/tests/data/openml/1/api-v1-json-data-1.json.gz b/sklearn/datasets/tests/data/openml/1/api-v1-json-data-1.json.gz index f75912bf2def733f38da398a5155a89bc110dc51..ba544db4916374d870531fbfcdb4e88718569d30 100644 GIT binary patch delta 1780 zcmVYIr`fm{qF}9X<3S$hokW%67UfOK7s`C&&iuXL?vBSOs*nUg@Tuo z7x`cg|D3#eGvE=v4eS9?A34pK`lNZzXsTr!CMt~|I9k%TGB`iKtYuP&^Yb}>L`6;| ziy(N0cXY!eSV$&Pp09!C5quN$8LsMF@|*!I^YimXQHzkNk-ry$uX#>WID5FhdHcs% zB4s&Wj+UeCb~`LemS^d(5bNbAT8^BzV1c%9n=)E42)S&`DB&`{Pk06RqfiP~iXvoH zgx48Eg~2o`_894M22%fSyi?C+K%`U5D=kTN*i4YJ=3;WAAj<5i^s+R(IS zIE7#!1#6&e z7ZTP*5uy7ulssfat9DMQSEe)@HSZ7}aBS-*@Vt_YMv4e~2=`p3OojY^4hIJ~d=8gQ zoGJ(oJDsc)dFtCEFM$G$~&Qx;3*%3V=a@Jr2eL}R#M&!#?51!US5J`*JV zGYu1!Yd{5LOfM}Nl^{~ZcZae=mtqIAWT^Jq}S>{ERJCjCMnM{CR%FBYd5s1FGh>RSG9;~ z$gG&>yi$g4krvD7oxT3Anrde(@{YVGABdsYMv9p_sMC(S2hQ3^ZV+}?0iWO6d63m0 zepQQf2J%%c(iv=*ql|SCSDhe`%^-bsII@_%Q0}!3(ibLwqUa&6TP@O8CPmd7e%or1 zyb!Y~>ZsB}MuW*xmCHqj3#I#oFHa}(*N&K@GMVuYex=3R6D#flM@pY}rz=W2)#l8OT z&CdiDpWeTJH(Sa@GVufqpSq}*Tfs$6w>;`>mSZ5KHf4F=p92swT<{kuWs%=@+saiO z7d@%~#`uu&h5F;EKfX6MXr#8J;Zw6e&BoChEXa6I*i+Z;De|BI?Wuc@mxoj9w0i}> zVvPGWeTGx4-78v_DGw=@ixwm(7AJwM(Y!4xr{*qyvC^1RI8#Nn!g_>dE$H1VQntoB zMa1#u#pJ%o9U50)tZ-XZ?bIcQ1{K44q)HEM$Ln!#O#wqbKATI;zG{4JEYD%V*Ex?l zdeM}}<>1HVKu_Odk*|GyOh z{f^Lo@og=fc7;=u1hWH+_PFPZ#@XF4{Fy;^195C$3wxnVwL|u#H{2k*(Rfp1vm1}^ zdRo|x#~aVIup5u(Ia=6_$DipNhtrG5aOCtO(i}P6hhgQ+;<-b1gICXVr*K}#TZP+`d*fCy>5tg$wEp&M zhU^p%eT6$|kF4HfS=Mm{u9j{#_1#tVnU{ATK4@88rUi{w*bOz^sx!7H));@@z2nY* z?b+NJYaE`z>g-dPTfDpT&>4AG$ppQW3|-H=eRp zuYrzOOzTvxBo%Ayx9RSPRFI~+^m^)5(-?L-Yf0Dm_4}Zbb@aSKn_yliop)co)*&DK zp>6Ss>x-Z=RCeOrb=0e-|L;WoYFn*i$6+GLx>Ddylc`mvmsT~a!~WE=$JW@z6J$Q| z=B}%;>cS)x;trcpkUYa0Qq)o_v*=>gbW&BFDa>^2*xrO)uppdyEDmRsO`HqI4r2T_ Wnc0JrH~*ged-6a2I`gW|5C8zcvUb}5 delta 1779 zcmVMTf!{gno>9pn z2p-`zUGoU$l8J<8E1+2fUj%)Giz<^mV*q}3dOFW55i&LMuUzmI&u9WCcb8W$|2~PO zEN1?|A8a<8eqOLFP5QZ5`Gd$GI4{8hZQ(j$v}6!+S({NpKf8^23HYN>2v&$ZWMzq7 znNT2Kf{Yowq7VzV{K@YrlYZ}iXC3w{e?Wf*#TiltC)+{x%RZbZ38cI%RX}T+R1BvO z45VOn)Ktxw*JVfW#d%Y>ru!%lm&8w6E>BeYmEjM3?9ar>e4tEZ<~WH|oO;OW2xbvNH>+ z@Ubj-^Q}l|W~RP^vMLJ}N*((K*-h9|DpzjuvV>0+&k&8_f<2vhKoyW_TlkES_zyIU zRj&QcsK}JlUvlwK#(B|y2e>+pAf{_1J%=!6;e%eQU)gdA^Ds_$nljN?Lr|1g0Wztv 
z$|V!=Ssy-DX~2Xc`3kwG8RpvzCew)qjN5>7DFqKIsSxg@iF^!Uu4t_Cs!SfOI(ec- z>tc|St(bVKaHiXQ+iCFU^>_0eU>YzjmNezb;|!X|vdFs&^6oc(@@`IUKa%@%a{rdx zzbE$}$o<@m!z97o6jhhI~EKG*^Q})GTgzlXF~Lgb0c&Y7C^)@YFp91(GyR$HIu=+mDD zG3i6iQNR=2kTZjSu!%!lG9qUNHC3?*maSy`tSk1gk1LW{nEcueZR(@ZBJpJnL4PGmb-h-+DL8?wpRh4ugyHjYOs7( zi*yF^SuN5TZ2UpW+K7u*5XfebJUbj&OrI!sS_kP1W07}%5SNV>=_}&A>j3SBuR3cxdhqNw|SUyX!t4Mrat1>XAMQ3p@|G4^{ z!2JE2H)czJxkx6SfZ;iz^$mxbht<7=>gw!T1>-w_?LWXnxB&96!b-S%x#c|%D z3Sfv28J?-XvHJ7ILxVqH=8R5(|xgIffHeR7Nyi-IR zZ(dC9irl_&1;z@uQPoaea%fO8tVgQ!&}O_I_tF$F{>Koq{z~-XFWvS;g)yHlxlF8?Nk=qh^MmE}qo$ZUpEz#IB=b_gU zI-ae6g_E{$Vv=CGXVD&Ze9<_&9fmhG$aWx(?P*~rl!uAYLcCHMBbSZ9*W2v9unGzg!{?ny8EJk{akbRbIt8UGBQGMAJ>UvqL<5knrZmM zbNw|h^}aHl%B4P(wp+gbgpX90-$NJAEwUZFdL~U%aY!svJi0w}6ZohiS zR&m!=xTEIC>O7WZ9ai9K>1I>kT~(iXe)IONmQ_WP(`bR+P~ELMLwjTm@i)G8+?hRp zojF5|!!uZ&eF}4pcXu9|;UZBY)nJx~J51FT4c0ckVeg{m@(1{#+rvH;%cb#+Cv4Ga zpd+@VRU#LXiWTP6lEx1xTwt=8jWFp*?cDsZdG)GE_UtD4nfe`47~Yv|$$G8>I& zuB)-?!Xy;p2AffkJjEK4S5hm}=xkAUQe~AY%*YL#DRo&eoU#etf2VYE=A4CYIB}7q VaMC+^@#E;n(SJsQd~41S002yne&qlF diff --git a/sklearn/datasets/tests/data/openml/292/api-v1-json-data-292.json.gz b/sklearn/datasets/tests/data/openml/292/api-v1-json-data-292.json.gz index 3313b6fb4a0a26fd6c4561e80f51b93b4b7390b3..17caf8a8dcc7b1b33bca8f5dfd6d90b66a6b2e3a 100644 GIT binary patch delta 39 ucmZ3?vYbUuzMF%?(0^SV19M_Qrf!*`ZdP%Ao^DEFNusWirO`$yP9^~0L<<%G delta 35 qcmZ3^vY16mzMF%CnJ+n(fjK3yBr!fEwK%ybv!En1KX0QPCldgznhJ6N diff --git a/sklearn/datasets/tests/data/openml/292/api-v1-json-data-40981.json.gz b/sklearn/datasets/tests/data/openml/292/api-v1-json-data-40981.json.gz index 888140f92b36025bdaf00eba9eeb917fd9c3964e..9fd04cf4a73cb3799757bb1c677df329708ed35b 100644 GIT binary patch literal 553 zcmV+^0@nQ>iwFoeQm$SA17UD!Ep{<2YIARHEo5PIVJ$Q;IXE#cYIARH098`kirX*{ zeNMhYxNl(-+gdw~{g#)7LJLc?T_|-iV_A;H%97C~Azk+0D_LosfH7xgj^=V^-j|E0 z1p~FFrf#Vwij9Uz5tZnDxmZN3btu9Gjvl-zV4&Wyx7sM+3`FpssM6t8lm^dzu4uK| zThmLmT9xQvz;Jo2;;*bZynaKd606lgT1BW2{}69No&{PXoTS-0P1k7_IM1Adrid^z z5xuh#FFn)9eWs_VgF#x2=I)T6&5ibe6g7&`6K!v^c1jM@LmoQOF7u`mUko~fMnbB= zW%)zfjV}~B{NFyR;+sO1_=IGK2nVgHE)l*ueEkw_i{xr|6pU4#k!C@t&Q~#Bqr(VF z(|Z5?=_!PWa%`!l*u7@gxa6D?FLCYaOpBR^3luwY3GID2*{#$3CEb=uQsQ0UaYm>h z6X)4B&bmHV+`D>XMqO5M6`#-NjU1^M_(m$%j-RiJPpLz0XTDYyXT=WVDxSsJ=r9?m zuDcG+IIY8NP3(U+(}CrtHlU8~9KpDiH8ynqGf)g1o%JgkE~al>0Va;MyEqhSW82Fe z%x8>~89!z-&TjZQp7H1CcQ|XNS;IIp7vI8oBaC+Yu-bcD3T_1_CvY1=Lz~y6r{qoB rVbYuc-lagGDDc+(A2HcM-gI44Z zGD9^VEmtzB!25EsNOV&heyg9yn0|3i3Ju0Uu3*jH~RjP{wD}ZuXtRP&$B66MtBZdLW?L-VtET4OB zfPc1Upd*8F2F&U}=*_qG05r3jvomXNvvyhy(?=1N7@v8K|nF^q*Bh;_JcAsDEjuGp0V89{{stZ+Yn9*@0+OYcY{qZq| zsP=57r`Y|>u5l#YJYJcG1j$jcd(o>%4fVO zW?bCxOElxp$?tg9-0+4AZr_50@n$f_@57nkZ5i<^IQs#&F*LMgO?yUPjT-`?fn~Wb lXonao_C?u1RKVSKvukKcQMrN1a`CbJ_zTS2bE$~~005Kz4Z;8b From d00a4137110cef2167610767d97125340b1300c6 Mon Sep 17 00:00:00 2001 From: Shashank Singh Date: Sun, 3 May 2020 01:32:07 -0400 Subject: [PATCH 13/25] Linting updates --- sklearn/datasets/_openml.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/sklearn/datasets/_openml.py b/sklearn/datasets/_openml.py index c5a8da376403d..b92f1470f4690 100644 --- a/sklearn/datasets/_openml.py +++ b/sklearn/datasets/_openml.py @@ -471,8 +471,8 @@ def _stream_checksum_generator(response): if actual_md5_checksum.hexdigest() != md5_checksum: raise ValueError("md5 checksum of local file for " + url + " does not match description. 
" - "Downloaded file could have been modified / corrupted," - "clean cache and retry...") + "Downloaded file could have been modified / " + "corrupted, clean cache and retry...") arff = _arff.load(_stream_checksum_generator(response), return_type=return_type, @@ -480,8 +480,8 @@ def _stream_checksum_generator(response): return parse_arff(arff) -def _download_data_to_bunch(url, sparse, data_home, *, as_frame, features_list, data_columns, target_columns, shape, - md5_checksum): +def _download_data_to_bunch(url, sparse, data_home, *, as_frame, features_list, + data_columns, target_columns, shape, md5_checksum): """Download OpenML ARFF and convert to Bunch of data""" # NB: this function is long in order to handle retry for any failure # during the streaming parse of the ARFF. @@ -831,7 +831,8 @@ def fetch_openml(name=None, *, version='active', data_id=None, data_home=None, features_list=features_list, shape=shape, target_columns=target_columns, data_columns=data_columns, - md5_checksum=data_description["md5_checksum"]) + md5_checksum=data_description[ + "md5_checksum"]) if return_X_y: return bunch.data, bunch.target From 2e744aafdc7c7185c7af7b05a260b0bed0761867 Mon Sep 17 00:00:00 2001 From: Shashank Singh Date: Sun, 3 May 2020 21:23:11 -0400 Subject: [PATCH 14/25] Fully consume generator, test for non frame case --- sklearn/datasets/_openml.py | 6 +----- sklearn/datasets/tests/test_openml.py | 16 ++++++++++++---- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/sklearn/datasets/_openml.py b/sklearn/datasets/_openml.py index b92f1470f4690..fcf3e0a66759e 100644 --- a/sklearn/datasets/_openml.py +++ b/sklearn/datasets/_openml.py @@ -246,12 +246,8 @@ def _convert_arff_data(arff, col_slice_x, col_slice_y, shape=None): """ arff_data = arff['data'] if isinstance(arff_data, Generator): - if shape[0] == -1: - count = -1 - else: - count = shape[0] * shape[1] data = np.fromiter(itertools.chain.from_iterable(arff_data), - dtype='float64', count=count) + dtype='float64') data = data.reshape(*shape) X = data[:, col_slice_x] y = data[:, col_slice_y] diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py index 8187dc3feb9db..92ad6529e1467 100644 --- a/sklearn/datasets/tests/test_openml.py +++ b/sklearn/datasets/tests/test_openml.py @@ -1193,7 +1193,13 @@ def test_fetch_openml_with_ignored_feature(monkeypatch, gzip_response): assert 'animal' not in dataset['feature_names'] -def test_fetch_openml_verify_checksum(monkeypatch): +@pytest.mark.parametrize('as_frame,cache', [ + (True, True), + (True, False), + (False, True), + (False, False) +]) +def test_fetch_openml_verify_checksum(monkeypatch, as_frame, cache): data_id = 2 _monkey_patch_webbased_functions(monkeypatch, data_id, True) @@ -1212,11 +1218,13 @@ def test_fetch_openml_verify_checksum(monkeypatch): modified_gzip.write(data) # should fail checksum validation + if as_frame: + pytest.importorskip('pandas') with pytest.raises(ValueError) as exc: - sklearn.datasets.fetch_openml(data_id=data_id, cache=False, - as_frame=True) + sklearn.datasets.fetch_openml(data_id=data_id, cache=cache, + as_frame=as_frame) # exception message should have file-path - assert "1666876" in repr(exc) + assert "1666876" in str(exc) finally: shutil.copy(backup_data_path, original_data_path) os.remove(backup_data_path) From 1e6efdd44585cb8f6731dd69bb72966db96746ac Mon Sep 17 00:00:00 2001 From: Shashank Singh Date: Sun, 3 May 2020 22:03:17 -0400 Subject: [PATCH 15/25] Cross platform assert in test --- 
sklearn/datasets/tests/test_openml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py index 92ad6529e1467..7955efdda8e39 100644 --- a/sklearn/datasets/tests/test_openml.py +++ b/sklearn/datasets/tests/test_openml.py @@ -1224,7 +1224,7 @@ def test_fetch_openml_verify_checksum(monkeypatch, as_frame, cache): sklearn.datasets.fetch_openml(data_id=data_id, cache=cache, as_frame=as_frame) # exception message should have file-path - assert "1666876" in str(exc) + assert exc.match("1666876") finally: shutil.copy(backup_data_path, original_data_path) os.remove(backup_data_path) From bf937a3cb1883ea73d41729c589a7d5b2caa7b8c Mon Sep 17 00:00:00 2001 From: Shashank Singh Date: Mon, 25 May 2020 18:31:52 -0400 Subject: [PATCH 16/25] Intentionally reach end-of-stream checksum validation --- sklearn/datasets/_openml.py | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/sklearn/datasets/_openml.py b/sklearn/datasets/_openml.py index fcf3e0a66759e..454df78ab4646 100644 --- a/sklearn/datasets/_openml.py +++ b/sklearn/datasets/_openml.py @@ -246,8 +246,12 @@ def _convert_arff_data(arff, col_slice_x, col_slice_y, shape=None): """ arff_data = arff['data'] if isinstance(arff_data, Generator): + if shape[0] == -1: + count = -1 + else: + count = shape[0] * shape[1] data = np.fromiter(itertools.chain.from_iterable(arff_data), - dtype='float64') + dtype='float64', count=count) data = data.reshape(*shape) X = data[:, col_slice_x] y = data[:, col_slice_y] @@ -463,17 +467,26 @@ def _stream_checksum_generator(response): for line in response: actual_md5_checksum.update(line) yield line.decode('utf-8') - # stream consumed, check md5 - if actual_md5_checksum.hexdigest() != md5_checksum: - raise ValueError("md5 checksum of local file for " + url - + " does not match description. " - "Downloaded file could have been modified / " - "corrupted, clean cache and retry...") - - arff = _arff.load(_stream_checksum_generator(response), + + stream = _stream_checksum_generator(response) + + arff = _arff.load(stream, return_type=return_type, encode_nominal=encode_nominal) - return parse_arff(arff) + + parsed_arff = parse_arff(arff) + + # consume remaining stream, if early exited + for _ in stream: + pass + + if actual_md5_checksum.hexdigest() != md5_checksum: + raise ValueError("md5 checksum of local file for " + url + + " does not match description. 
" + "Downloaded file could have been modified / " + "corrupted, clean cache and retry...") + + return parsed_arff def _download_data_to_bunch(url, sparse, data_home, *, as_frame, features_list, From eac5a1edf290f57662de4bc467921c85dd52b26b Mon Sep 17 00:00:00 2001 From: Shashank Singh Date: Sat, 6 Jun 2020 13:59:21 -0400 Subject: [PATCH 17/25] Test should not modify local test-suite shared file --- sklearn/datasets/tests/test_openml.py | 124 +++++++++++++------------- 1 file changed, 64 insertions(+), 60 deletions(-) diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py index 7955efdda8e39..64dafdded2661 100644 --- a/sklearn/datasets/tests/test_openml.py +++ b/sklearn/datasets/tests/test_openml.py @@ -8,7 +8,6 @@ import scipy.sparse import sklearn import pytest -import shutil from sklearn import config_context from sklearn.datasets import fetch_openml @@ -147,6 +146,38 @@ def _fetch_dataset_from_openml(data_id, data_name, data_version, return data_by_id +class MockHTTPResponse: + def __init__(self, data, is_gzip): + self.data = data + self.is_gzip = is_gzip + + def read(self, amt=-1): + return self.data.read(amt) + + def tell(self): + return self.data.tell() + + def seek(self, pos, whence=0): + return self.data.seek(pos, whence) + + def close(self): + self.data.close() + + def info(self): + if self.is_gzip: + return {'Content-Encoding': 'gzip'} + return {} + + def __iter__(self): + return iter(self.data) + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + return False + + def _monkey_patch_webbased_functions(context, data_id, gzip_response): @@ -161,37 +192,6 @@ def _monkey_patch_webbased_functions(context, path_suffix = '.gz' read_fn = gzip.open - class MockHTTPResponse: - def __init__(self, data, is_gzip): - self.data = data - self.is_gzip = is_gzip - - def read(self, amt=-1): - return self.data.read(amt) - - def tell(self): - return self.data.tell() - - def seek(self, pos, whence=0): - return self.data.seek(pos, whence) - - def close(self): - self.data.close() - - def info(self): - if self.is_gzip: - return {'Content-Encoding': 'gzip'} - return {} - - def __iter__(self): - return iter(self.data) - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - return False - def _file_name(url, suffix): return (re.sub(r'\W', '-', url[len("https://openml.org/"):]) + suffix + path_suffix) @@ -1193,38 +1193,42 @@ def test_fetch_openml_with_ignored_feature(monkeypatch, gzip_response): assert 'animal' not in dataset['feature_names'] -@pytest.mark.parametrize('as_frame,cache', [ - (True, True), - (True, False), - (False, True), - (False, False) -]) +@pytest.mark.parametrize('as_frame', [True, False]) def test_fetch_openml_verify_checksum(monkeypatch, as_frame, cache): data_id = 2 _monkey_patch_webbased_functions(monkeypatch, data_id, True) - # modify a mocked file content to change checksum + # create a modified (local) arff file dataset_dir = os.path.join(currdir, 'data', 'openml', str(data_id)) original_data_path = os.path.join(dataset_dir, 'data-v1-download-1666876.arff.gz') - backup_data_path = original_data_path + ".back" - shutil.copy(original_data_path, backup_data_path) - - try: - with gzip.GzipFile(backup_data_path, "rb") as orig_gzip, \ - gzip.GzipFile(original_data_path, "wb") as modified_gzip: - data = bytearray(orig_gzip.read()) - data[len(data)-1] = 37 - modified_gzip.write(data) - - # should fail checksum validation - if as_frame: - pytest.importorskip('pandas') - with 
pytest.raises(ValueError) as exc: - sklearn.datasets.fetch_openml(data_id=data_id, cache=cache, - as_frame=as_frame) - # exception message should have file-path - assert exc.match("1666876") - finally: - shutil.copy(backup_data_path, original_data_path) - os.remove(backup_data_path) + corrupt_copy = original_data_path + ".test_corrupt_arff" + with gzip.GzipFile(original_data_path, "rb") as orig_gzip, \ + gzip.GzipFile(corrupt_copy, "wb") as modified_gzip: + data = bytearray(orig_gzip.read()) + data[len(data)-1] = 37 + modified_gzip.write(data) + + # simulate request to return modified file + mocked_openml_url = sklearn.datasets._openml.urlopen + + def swap_file_mock(request): + url = request.get_full_url() + if url.endswith('data/v1/download/1666876'): + return MockHTTPResponse(open(corrupt_copy, "rb"), is_gzip=True) + else: + return mocked_openml_url(request) + + monkeypatch.setattr(sklearn.datasets._openml, 'urlopen', swap_file_mock) + + # validate failed checksum + if as_frame: + pytest.importorskip('pandas') + with pytest.raises(ValueError) as exc: + sklearn.datasets.fetch_openml(data_id=data_id, cache=False, + as_frame=as_frame) + # exception message should have file-path + assert exc.match("1666876") + + # cleanup fake local file + os.remove(corrupt_copy) From d075a830cfa6b16e704a7c92823a78f2fbaa36fa Mon Sep 17 00:00:00 2001 From: Shashank Singh Date: Sat, 6 Jun 2020 21:53:33 -0400 Subject: [PATCH 18/25] Update sklearn/datasets/tests/test_openml.py flake8 Co-authored-by: Thomas J. Fan --- sklearn/datasets/tests/test_openml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py index 64dafdded2661..0e3e63925c139 100644 --- a/sklearn/datasets/tests/test_openml.py +++ b/sklearn/datasets/tests/test_openml.py @@ -1204,7 +1204,7 @@ def test_fetch_openml_verify_checksum(monkeypatch, as_frame, cache): 'data-v1-download-1666876.arff.gz') corrupt_copy = original_data_path + ".test_corrupt_arff" with gzip.GzipFile(original_data_path, "rb") as orig_gzip, \ - gzip.GzipFile(corrupt_copy, "wb") as modified_gzip: + gzip.GzipFile(corrupt_copy, "wb") as modified_gzip: data = bytearray(orig_gzip.read()) data[len(data)-1] = 37 modified_gzip.write(data) From 23ba1905e2ab37c0321e5e3f3ff7f809870047b8 Mon Sep 17 00:00:00 2001 From: Shashank Singh Date: Sat, 6 Jun 2020 22:39:48 -0400 Subject: [PATCH 19/25] Use tmpdir for creating corrupt file, add comments, update changelog --- doc/whats_new/v0.24.rst | 8 +++++--- sklearn/datasets/tests/test_openml.py | 10 ++++++---- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst index dd4ab30a7f2ff..6a968d51871a5 100644 --- a/doc/whats_new/v0.24.rst +++ b/doc/whats_new/v0.24.rst @@ -44,9 +44,11 @@ Changelog :pr:`123456` by :user:`Joe Bloggs `. where 123456 is the *pull request* number, not the issue number. -:mod:`sklearn.module` -..................... - +:mod:`sklearn.datasets` +....................... +- |Feature| :func:`datasets.fetch_openml` now validates md5checksum of arff + files downloaded or cached to ensure data integrity. + :pr:`14800` by :user:`Shashank Singh ` and `Joel Nothman`_. 
Code and Documentation Contributors ----------------------------------- diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py index 64dafdded2661..a55c0a70f1d59 100644 --- a/sklearn/datasets/tests/test_openml.py +++ b/sklearn/datasets/tests/test_openml.py @@ -1194,22 +1194,24 @@ def test_fetch_openml_with_ignored_feature(monkeypatch, gzip_response): @pytest.mark.parametrize('as_frame', [True, False]) -def test_fetch_openml_verify_checksum(monkeypatch, as_frame, cache): +def test_fetch_openml_verify_checksum(monkeypatch, as_frame, cache, tmpdir): data_id = 2 _monkey_patch_webbased_functions(monkeypatch, data_id, True) - # create a modified (local) arff file + # create a temporary modified arff file dataset_dir = os.path.join(currdir, 'data', 'openml', str(data_id)) original_data_path = os.path.join(dataset_dir, 'data-v1-download-1666876.arff.gz') - corrupt_copy = original_data_path + ".test_corrupt_arff" + corrupt_copy = tmpdir + "/test_invalid_checksum.arff" with gzip.GzipFile(original_data_path, "rb") as orig_gzip, \ gzip.GzipFile(corrupt_copy, "wb") as modified_gzip: data = bytearray(orig_gzip.read()) data[len(data)-1] = 37 modified_gzip.write(data) - # simulate request to return modified file + # Requests are already mocked by monkey_patch_webbased_functions. + # We want to re-use that mock for all requests except file download, + # hence creating a thin mock over the original mock mocked_openml_url = sklearn.datasets._openml.urlopen def swap_file_mock(request): From c2161712b00dcf0181befee88402448bf87922ee Mon Sep 17 00:00:00 2001 From: Shashank Singh Date: Sat, 6 Jun 2020 23:15:03 -0400 Subject: [PATCH 20/25] Make test-path platform independent --- sklearn/datasets/tests/test_openml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py index 5a880811771bf..1d1341989f740 100644 --- a/sklearn/datasets/tests/test_openml.py +++ b/sklearn/datasets/tests/test_openml.py @@ -1202,7 +1202,7 @@ def test_fetch_openml_verify_checksum(monkeypatch, as_frame, cache, tmpdir): dataset_dir = os.path.join(currdir, 'data', 'openml', str(data_id)) original_data_path = os.path.join(dataset_dir, 'data-v1-download-1666876.arff.gz') - corrupt_copy = tmpdir + "/test_invalid_checksum.arff" + corrupt_copy = os.path.join(tmpdir, "test_invalid_checksum.arff") with gzip.GzipFile(original_data_path, "rb") as orig_gzip, \ gzip.GzipFile(corrupt_copy, "wb") as modified_gzip: data = bytearray(orig_gzip.read()) From 536bc4f683518ca78c2998ce22c46bb3a71c742a Mon Sep 17 00:00:00 2001 From: Shashank Singh Date: Sun, 7 Jun 2020 10:40:51 -0400 Subject: [PATCH 21/25] Do not remove file explicitly from tmpdir --- sklearn/datasets/tests/test_openml.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py index 1d1341989f740..b42551ba2fdfe 100644 --- a/sklearn/datasets/tests/test_openml.py +++ b/sklearn/datasets/tests/test_openml.py @@ -1231,6 +1231,3 @@ def swap_file_mock(request): as_frame=as_frame) # exception message should have file-path assert exc.match("1666876") - - # cleanup fake local file - os.remove(corrupt_copy) From c55f64a2af78d953de8dfa00f893bbfecc28b120 Mon Sep 17 00:00:00 2001 From: Shashank Singh Date: Sun, 7 Jun 2020 18:17:34 -0400 Subject: [PATCH 22/25] Make test mock class private to ignore coverage --- sklearn/datasets/tests/test_openml.py | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 
16 deletions(-) diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py index b42551ba2fdfe..04e65c960e4c2 100644 --- a/sklearn/datasets/tests/test_openml.py +++ b/sklearn/datasets/tests/test_openml.py @@ -146,7 +146,7 @@ def _fetch_dataset_from_openml(data_id, data_name, data_version, return data_by_id -class MockHTTPResponse: +class _MockHTTPResponse: def __init__(self, data, is_gzip): self.data = data self.is_gzip = is_gzip @@ -154,12 +154,6 @@ def __init__(self, data, is_gzip): def read(self, amt=-1): return self.data.read(amt) - def tell(self): - return self.data.tell() - - def seek(self, pos, whence=0): - return self.data.seek(pos, whence) - def close(self): self.data.close() @@ -204,10 +198,10 @@ def _mock_urlopen_data_description(url, has_gzip_header): if has_gzip_header and gzip_response: fp = open(path, 'rb') - return MockHTTPResponse(fp, True) + return _MockHTTPResponse(fp, True) else: fp = read_fn(path, 'rb') - return MockHTTPResponse(fp, False) + return _MockHTTPResponse(fp, False) def _mock_urlopen_data_features(url, has_gzip_header): assert url.startswith(url_prefix_data_features) @@ -215,10 +209,10 @@ def _mock_urlopen_data_features(url, has_gzip_header): _file_name(url, '.json')) if has_gzip_header and gzip_response: fp = open(path, 'rb') - return MockHTTPResponse(fp, True) + return _MockHTTPResponse(fp, True) else: fp = read_fn(path, 'rb') - return MockHTTPResponse(fp, False) + return _MockHTTPResponse(fp, False) def _mock_urlopen_download_data(url, has_gzip_header): assert (url.startswith(url_prefix_download_data)) @@ -228,10 +222,10 @@ def _mock_urlopen_download_data(url, has_gzip_header): if has_gzip_header and gzip_response: fp = open(path, 'rb') - return MockHTTPResponse(fp, True) + return _MockHTTPResponse(fp, True) else: fp = read_fn(path, 'rb') - return MockHTTPResponse(fp, False) + return _MockHTTPResponse(fp, False) def _mock_urlopen_data_list(url, has_gzip_header): assert url.startswith(url_prefix_data_list) @@ -248,10 +242,10 @@ def _mock_urlopen_data_list(url, has_gzip_header): if has_gzip_header: fp = open(json_file_path, 'rb') - return MockHTTPResponse(fp, True) + return _MockHTTPResponse(fp, True) else: fp = read_fn(json_file_path, 'rb') - return MockHTTPResponse(fp, False) + return _MockHTTPResponse(fp, False) def _mock_urlopen(request): url = request.get_full_url() @@ -1217,7 +1211,7 @@ def test_fetch_openml_verify_checksum(monkeypatch, as_frame, cache, tmpdir): def swap_file_mock(request): url = request.get_full_url() if url.endswith('data/v1/download/1666876'): - return MockHTTPResponse(open(corrupt_copy, "rb"), is_gzip=True) + return _MockHTTPResponse(open(corrupt_copy, "rb"), is_gzip=True) else: return mocked_openml_url(request) From fc9181dfe73285921841f488e13e84032c3142df Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 25 Jun 2020 21:38:26 +0200 Subject: [PATCH 23/25] Fix merge conflict issues --- sklearn/datasets/_openml.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sklearn/datasets/_openml.py b/sklearn/datasets/_openml.py index 58d199117919c..d2ce5d471f8f7 100644 --- a/sklearn/datasets/_openml.py +++ b/sklearn/datasets/_openml.py @@ -488,6 +488,7 @@ def _get_num_samples(data_qualities: OpenmlQualitiesType) -> int: qualities = {d['name']: d['value'] for d in data_qualities} return int(float(qualities.get('NumberOfInstances', default_n_samples))) + def _load_arff_response( url: str, data_home: Optional[str], @@ -515,7 +516,7 @@ def _stream_checksum_generator(response): 
encode_nominal=encode_nominal) parsed_arff = parse_arff(arff) - + # consume remaining stream, if early exited for _ in stream: pass @@ -538,7 +539,8 @@ def _download_data_to_bunch( features_list: List, data_columns: List[int], target_columns: List, - shape: Optional[Tuple[int, int]] + shape: Optional[Tuple[int, int]], + md5_checksum: str ): """Download OpenML ARFF and convert to Bunch of data """ From 07a53b6064753c1ac0a1a0f3881f454458409573 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 25 Jun 2020 21:43:12 +0200 Subject: [PATCH 24/25] fmt --- sklearn/datasets/tests/test_openml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py index 8157f96eee769..687c22cdbfd37 100644 --- a/sklearn/datasets/tests/test_openml.py +++ b/sklearn/datasets/tests/test_openml.py @@ -1265,4 +1265,4 @@ def test_convert_arff_data_type(): } msg = r"arff\['data'\] must be a generator when converting to pd.DataFrame" with pytest.raises(ValueError, match=msg): - _convert_arff_data_dataframe(arff, ['a'], {}) \ No newline at end of file + _convert_arff_data_dataframe(arff, ['a'], {}) From d5bebcf7a2d708905f31a83607817ad12925a8d9 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Thu, 25 Jun 2020 20:03:49 -0400 Subject: [PATCH 25/25] CLN Early skip if pandas is not avaliable --- sklearn/datasets/tests/test_openml.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py index 687c22cdbfd37..4e1220807947f 100644 --- a/sklearn/datasets/tests/test_openml.py +++ b/sklearn/datasets/tests/test_openml.py @@ -1206,6 +1206,9 @@ def test_fetch_openml_with_ignored_feature(monkeypatch, gzip_response): @pytest.mark.parametrize('as_frame', [True, False]) def test_fetch_openml_verify_checksum(monkeypatch, as_frame, cache, tmpdir): + if as_frame: + pytest.importorskip('pandas') + data_id = 2 _monkey_patch_webbased_functions(monkeypatch, data_id, True) @@ -1235,8 +1238,6 @@ def swap_file_mock(request): monkeypatch.setattr(sklearn.datasets._openml, 'urlopen', swap_file_mock) # validate failed checksum - if as_frame: - pytest.importorskip('pandas') with pytest.raises(ValueError) as exc: sklearn.datasets.fetch_openml(data_id=data_id, cache=False, as_frame=as_frame)
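Note on the streaming checksum pattern used above: the MD5 digest is updated on the raw bytes as the ARFF generator yields decoded lines, so the comparison can only happen once the stream has been fully consumed, which is why the helper drains any remaining lines before checking. A minimal, self-contained sketch of the idea (illustrative names only, not the scikit-learn implementation itself):

    import hashlib
    from io import BytesIO

    # Illustrative sketch; the function and argument names are not sklearn's.
    def iter_with_md5_check(fileobj, expected_md5, url="<resource>"):
        # Yield decoded lines while hashing the raw bytes; compare digests
        # only after the underlying stream is exhausted.
        md5 = hashlib.md5()
        for line in fileobj:
            md5.update(line)
            yield line.decode("utf-8")
        if md5.hexdigest() != expected_md5:
            raise ValueError("md5 checksum of {} does not match the expected "
                             "value; the download may be corrupted".format(url))

    payload = b"@relation demo\n@attribute x numeric\n@data\n1.0\n"
    good = hashlib.md5(payload).hexdigest()
    list(iter_with_md5_check(BytesIO(payload), good))          # passes silently
    try:
        list(iter_with_md5_check(BytesIO(payload), "0" * 32))  # raises
    except ValueError as exc:
        print(exc)

If a parser stopped reading early, the final lines would never pass through the hash, which is the failure mode the "consume remaining stream, if early exited" loop guards against.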
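Note on the corrupted-fixture technique in test_fetch_openml_verify_checksum: the test re-compresses a copy of the cached ARFF with one byte of the decompressed payload changed, so the gzip container stays readable but the MD5 of the decoded content no longer matches the value recorded in the dataset description. A rough, self-contained sketch of that setup (temporary paths and file contents below are made up for illustration):

    import gzip
    import hashlib
    import os
    import tempfile

    tmp = tempfile.mkdtemp()
    src = os.path.join(tmp, "fixture.arff.gz")
    dst = os.path.join(tmp, "fixture_corrupt.arff.gz")

    with gzip.GzipFile(src, "wb") as f:
        f.write(b"@relation demo\n@data\n1.0\n")

    # Re-gzip a copy with the last decompressed byte altered.
    with gzip.GzipFile(src, "rb") as orig, gzip.GzipFile(dst, "wb") as out:
        data = bytearray(orig.read())
        data[-1] = 37
        out.write(bytes(data))

    with gzip.GzipFile(src, "rb") as a, gzip.GzipFile(dst, "rb") as b:
        assert (hashlib.md5(a.read()).hexdigest()
                != hashlib.md5(b.read()).hexdigest())

Serving the corrupted copy through the monkeypatched urlopen (the thin swap_file_mock layered over the existing mock) is then enough to make fetch_openml raise the checksum ValueError that the test asserts on.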