From 7692122a8b26c6b1913cdeb3c93ec271a582377d Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sun, 26 Apr 2020 20:18:45 +0200 Subject: [PATCH 01/18] Type annotations for OpenML fetcher --- azure-pipelines.yml | 1 + sklearn/cluster/_agglomerative.py | 3 +- sklearn/datasets/_base.py | 2 +- sklearn/datasets/_openml.py | 138 ++++++++++++++++++++++-------- sklearn/externals/_arff.py | 18 +++- sklearn/manifold/_t_sne.py | 3 +- sklearn/utils/validation.py | 3 +- 7 files changed, 124 insertions(+), 44 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 196d4ca34f434..3e7b46039dfc4 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -33,6 +33,7 @@ jobs: fi displayName: Run linting - bash: | + set -ex if [[ $BUILD_SOURCEVERSIONMESSAGE =~ \[lint\ skip\] ]]; then # skip linting echo "Skipping linting" diff --git a/sklearn/cluster/_agglomerative.py b/sklearn/cluster/_agglomerative.py index 92246141d6fe8..94c057fbea09d 100644 --- a/sklearn/cluster/_agglomerative.py +++ b/sklearn/cluster/_agglomerative.py @@ -21,7 +21,8 @@ from ..neighbors import DistanceMetric from ..neighbors._dist_metrics import METRIC_MAPPING -from . import _hierarchical_fast as _hierarchical +# error: Module 'sklearn.cluster' has no attribute '_hierarchical_fast' +from . import _hierarchical_fast as _hierarchical # type: ignore from ._feature_agglomeration import AgglomerationTransform from ..utils._fast_dict import IntFloatDict from ..utils.fixes import _astype_copy_false diff --git a/sklearn/datasets/_base.py b/sklearn/datasets/_base.py index 9737a5f67891a..f88b7fbc81ee5 100644 --- a/sklearn/datasets/_base.py +++ b/sklearn/datasets/_base.py @@ -26,7 +26,7 @@ ['filename', 'url', 'checksum']) -def get_data_home(data_home=None): +def get_data_home(data_home=None) -> str: """Return the path of the scikit-learn data dir. This folder is used by some large dataset loaders to avoid downloading the diff --git a/sklearn/datasets/_openml.py b/sklearn/datasets/_openml.py index 10f40dc8906bf..cb36734144f52 100644 --- a/sklearn/datasets/_openml.py +++ b/sklearn/datasets/_openml.py @@ -6,6 +6,7 @@ from warnings import warn from contextlib import closing from functools import wraps +from typing import Callable, Optional, Dict, Tuple, List, Any, Union import itertools from collections.abc import Generator from collections import OrderedDict @@ -17,6 +18,7 @@ import scipy.sparse from ..externals import _arff +from ..externals._arff import ArffDataType from . import get_data_home from urllib.error import HTTPError from ..utils import Bunch @@ -34,11 +36,16 @@ _DATA_FILE = "data/v1/download/{}" -def _get_local_path(openml_path, data_home): +ArffQuantileType = List[Dict[str, str]] + + +def _get_local_path(openml_path: str, data_home: str) -> str: return os.path.join(data_home, 'openml.org', openml_path + ".gz") -def _retry_with_clean_cache(openml_path, data_home): +def _retry_with_clean_cache( + openml_path: str, data_home: Optional[str] +) -> Callable: """If the first call to the decorated function fails, the local cached file is removed, and the function is called again. If ``data_home`` is ``None``, then the function is called once. @@ -62,7 +69,7 @@ def wrapper(*args, **kw): return decorator -def _open_openml_url(openml_path, data_home): +def _open_openml_url(openml_path: str, data_home: Optional[str]): """ Returns a resource from OpenML.org. Caches it to data_home if required. 
@@ -103,12 +110,13 @@ def is_gzip(_fsrc): try: with closing(urlopen(req)) as fsrc: + opener: Callable if is_gzip(fsrc): - with open(local_path, 'wb') as fdst: - shutil.copyfileobj(fsrc, fdst) + opener = gzip.GzipFile else: - with gzip.GzipFile(local_path, 'wb') as fdst: - shutil.copyfileobj(fsrc, fdst) + opener = open + with opener(local_path, 'wb') as fdst: + shutil.copyfileobj(fsrc, fdst) except Exception: if os.path.exists(local_path): os.unlink(local_path) @@ -119,8 +127,12 @@ def is_gzip(_fsrc): return gzip.GzipFile(local_path, 'rb') -def _get_json_content_from_openml_api(url, error_message, raise_if_error, - data_home): +def _get_json_content_from_openml_api( + url: str, + error_message: Optional[str], + raise_if_error: bool, + data_home: Optional[str] +) -> Optional[Dict]: """ Loads json data from the openml api @@ -170,7 +182,9 @@ def _load_json(): return None -def _split_sparse_columns(arff_data, include_columns): +def _split_sparse_columns( + arff_data: ArffDataType, include_columns: List +) -> ArffDataType: """ obtains several columns from sparse arff representation. Additionally, the column indices are re-labelled, given the columns that are not included. @@ -192,7 +206,7 @@ def _split_sparse_columns(arff_data, include_columns): Subset of arff data with only the include columns indicated by the include_columns argument. """ - arff_data_new = (list(), list(), list()) + arff_data_new: ArffDataType = (list(), list(), list()) reindexed_columns = {column_idx: array_idx for array_idx, column_idx in enumerate(include_columns)} for val, row_idx, col_idx in zip(arff_data[0], arff_data[1], arff_data[2]): @@ -203,7 +217,9 @@ def _split_sparse_columns(arff_data, include_columns): return arff_data_new -def _sparse_data_to_array(arff_data, include_columns): +def _sparse_data_to_array( + arff_data: ArffDataType, include_columns: List +) -> np.ndarray: # turns the sparse data back into an array (can't use toarray() function, # as this does only work on numeric data) num_obs = max(arff_data[1]) + 1 @@ -218,7 +234,12 @@ def _sparse_data_to_array(arff_data, include_columns): return y -def _convert_arff_data(arff, col_slice_x, col_slice_y, shape=None): +def _convert_arff_data( + arff: Dict[str, Any], + col_slice_x: List[int], + col_slice_y: List[int], + shape: Optional[Tuple] =None +) -> Tuple: """ converts the arff object into the appropriate matrix type (np.array or scipy.sparse.csr_matrix) based on the 'data part' (i.e., in the @@ -244,6 +265,10 @@ def _convert_arff_data(arff, col_slice_x, col_slice_y, shape=None): """ arff_data = arff['data'] if isinstance(arff_data, Generator): + if shape is None: + raise ValueError( + "shape must be provided when arr['data'] is a Generator" + ) if shape[0] == -1: count = -1 else: @@ -269,7 +294,7 @@ def _convert_arff_data(arff, col_slice_x, col_slice_y, shape=None): raise ValueError('Unexpected Data Type obtained from arff.') -def _feature_to_dtype(feature): +def _feature_to_dtype(feature: Dict[str, str]): """Map feature to dtype for pandas DataFrame """ if feature['data_type'] == 'string': @@ -286,7 +311,9 @@ def _feature_to_dtype(feature): raise ValueError('Unsupported feature: {}'.format(feature)) -def _convert_arff_data_dataframe(arff, columns, features_dict): +def _convert_arff_data_dataframe( + arff, columns: List, features_dict: Dict[str, Any] +) -> Tuple: """Convert the ARFF object into a pandas DataFrame. 
Parameters @@ -333,7 +360,9 @@ def _convert_arff_data_dataframe(arff, columns, features_dict): return (df, ) -def _get_data_info_by_name(name, version, data_home): +def _get_data_info_by_name( + name: str, version: Union[int, str], data_home: Optional[str] +): """ Utilizes the openml dataset listing api to find a dataset by name/version @@ -365,8 +394,11 @@ def _get_data_info_by_name(name, version, data_home): # situation in which we return the oldest active version url = _SEARCH_NAME.format(name) + "/status/active/" error_msg = "No active dataset {} found.".format(name) - json_data = _get_json_content_from_openml_api(url, error_msg, True, - data_home) + json_data = _get_json_content_from_openml_api( + url, error_msg, raise_if_error=True, data_home=data_home + ) + # json_data should never be None with raise_if_error=True + assert json_data is not None res = json_data['data']['dataset'] if len(res) > 1: warn("Multiple active versions of the dataset matching the name" @@ -377,8 +409,9 @@ def _get_data_info_by_name(name, version, data_home): # an integer version has been provided url = (_SEARCH_NAME + "/data_version/{}").format(name, version) - json_data = _get_json_content_from_openml_api(url, None, False, - data_home) + json_data = _get_json_content_from_openml_api( + url, error_message=None, raise_if_error=False, data_home=data_home + ) if json_data is None: # we can do this in 1 function call if OpenML does not require the # specification of the dataset status (i.e., return datasets with a @@ -387,38 +420,56 @@ def _get_data_info_by_name(name, version, data_home): url += "/status/deactivated" error_msg = "Dataset {} with version {} not found.".format(name, version) - json_data = _get_json_content_from_openml_api(url, error_msg, True, - data_home) + json_data = _get_json_content_from_openml_api( + url, error_msg, raise_if_error=True, data_home=data_home + ) + # json_data should never be None with raise_if_error=True + assert json_data is not None return json_data['data']['dataset'][0] -def _get_data_description_by_id(data_id, data_home): +def _get_data_description_by_id( + data_id: int, data_home: Optional[str] +) -> Dict[str, Any]: # OpenML API function: https://www.openml.org/api_docs#!/data/get_data_id url = _DATA_INFO.format(data_id) error_message = "Dataset with data_id {} not found.".format(data_id) - json_data = _get_json_content_from_openml_api(url, error_message, True, - data_home) + json_data = _get_json_content_from_openml_api( + url, error_message, raise_if_error=True, data_home=data_home + ) + # json_data should never be None with raise_if_error=True + assert json_data is not None return json_data['data_set_description'] -def _get_data_features(data_id, data_home): +def _get_data_features( + data_id: int, data_home: Optional[str] +) -> List[Dict[str, str]]: # OpenML function: # https://www.openml.org/api_docs#!/data/get_data_features_id url = _DATA_FEATURES.format(data_id) error_message = "Dataset with data_id {} not found.".format(data_id) - json_data = _get_json_content_from_openml_api(url, error_message, True, - data_home) + json_data = _get_json_content_from_openml_api( + url, error_message, raise_if_error=True, data_home=data_home + ) + # json_data should never be None with raise_if_error=True + assert json_data is not None return json_data['data_features']['feature'] -def _get_data_qualities(data_id, data_home): +def _get_data_qualities( + data_id: int, data_home: Optional[str] +) -> Optional[ArffQuantileType]: # OpenML API function: # 
https://www.openml.org/api_docs#!/data/get_data_qualities_id url = _DATA_QUALITIES.format(data_id) error_message = "Dataset with data_id {} not found.".format(data_id) - json_data = _get_json_content_from_openml_api(url, error_message, True, - data_home) + json_data = _get_json_content_from_openml_api( + url, error_message, raise_if_error=True, data_home=data_home + ) + # json_data should never be None with raise_if_error=True + assert json_data is not None try: return json_data['data_qualities']['quality'] except KeyError: @@ -427,7 +478,7 @@ def _get_data_qualities(data_id, data_home): return None -def _get_num_samples(data_qualities): +def _get_num_samples(data_qualities: Optional[ArffQuantileType]) -> int: """Get the number of samples from data qualities. Parameters @@ -506,7 +557,7 @@ def _download_data_to_bunch(url, sparse, data_home, *, parse_arff = partial(_convert_arff_data_dataframe, columns=columns, features_dict=features_dict) - def postprocess(frame): + def postprocess_frame(frame): X = frame[data_columns] if len(target_columns) >= 2: y = frame[target_columns] @@ -527,7 +578,7 @@ def parse_arff(arff): k in data_columns + target_columns} return X, y, nominal_attributes - def postprocess(X, y, nominal_attributes): + def postprocess_X_y(X, y, nominal_attributes): is_classification = {col_name in nominal_attributes for col_name in target_columns} if not is_classification: @@ -558,7 +609,10 @@ def postprocess(X, y, nominal_attributes): return_type=return_type, encode_nominal=not as_frame, parse_arff=parse_arff) - X, y, frame, nominal_attributes = postprocess(*out) + if as_frame: + X, y, frame, nominal_attributes = postprocess_frame(*out) + else: + X, y, frame, nominal_attributes = postprocess_X_y(*out) return Bunch(data=X, target=y, frame=frame, categories=nominal_attributes, @@ -608,9 +662,16 @@ def _valid_data_column_names(features_list, target_columns): return valid_data_column_names -def fetch_openml(name=None, version='active', data_id=None, data_home=None, - target_column='default-target', cache=True, return_X_y=False, - as_frame=False): +def fetch_openml( + name: Optional[str] = None, + version: Union[str, int] = 'active', + data_id: Optional[int] = None, + data_home: Optional[str] = None, + target_column: Optional[Union[str, List]] = 'default-target', + cache: bool = True, + return_X_y: bool = False, + as_frame: bool = False +): """Fetch dataset from openml by name or dataset id. Datasets are uniquely identified by either an integer ID or by a @@ -800,6 +861,7 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None, data_columns = _valid_data_column_names(features_list, target_columns) + shape: Optional[Tuple[int, int]] # determine arff encoding to return if not return_sparse: # The shape must include the ignored features to keep the right indexes diff --git a/sklearn/externals/_arff.py b/sklearn/externals/_arff.py index 63a39c3c15d4a..6d0ed3345ca8f 100644 --- a/sklearn/externals/_arff.py +++ b/sklearn/externals/_arff.py @@ -148,7 +148,8 @@ 'joel.nothman@gmail.com') __version__ = '2.4.0' -from typing import Optional +import typing +from typing import Optional, Tuple, List, Dict, Any import re import sys @@ -171,6 +172,19 @@ _RE_SPARSE_LINE = re.compile(r'^\s*\{.*\}\s*$', re.UNICODE) _RE_NONTRIVIAL_DATA = re.compile('["\'{}\\s]', re.UNICODE) +ArffDataType = Tuple[List, ...] 
+ +if typing.TYPE_CHECKING: + from typing_extensions import TypedDict + + class ArffContainerType(TypedDict): + description: str + relation: str + attributes: List + data: ArffDataType +else: + ArffContainerType = Dict[str, Any] + def _build_re_values(): quoted_re = r''' @@ -778,7 +792,7 @@ def _decode(self, s, encode_nominal=False, matrix_type=DENSE): s = s.strip('\r\n ').replace('\r\n', '\n').split('\n') # Create the return object - obj = { + obj: ArffContainerType = { u'description': u'', u'relation': u'', u'attributes': [], diff --git a/sklearn/manifold/_t_sne.py b/sklearn/manifold/_t_sne.py index eef67d5460e22..1abed4ab9fa3c 100644 --- a/sklearn/manifold/_t_sne.py +++ b/sklearn/manifold/_t_sne.py @@ -22,7 +22,8 @@ from ..utils.validation import _deprecate_positional_args from ..decomposition import PCA from ..metrics.pairwise import pairwise_distances -from . import _utils +# mypy error: Module 'sklearn.manifold' has no attribute '_utils' +from . import _utils # type: ignore # mypy error: Module 'sklearn.manifold' has no attribute '_barnes_hut_tsne' from . import _barnes_hut_tsne # type: ignore diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 953584fff0f8a..1b44ad69d968a 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -18,7 +18,8 @@ from distutils.version import LooseVersion from inspect import signature, isclass, Parameter -from numpy.core.numeric import ComplexWarning +# error: Module 'numpy.core.numeric' has no attribute 'ComplexWarning' +from numpy.core.numeric import ComplexWarning # type: ignore import joblib from contextlib import suppress From ffa469d97042771ce185831631368d80aaf504eb Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sun, 26 Apr 2020 21:38:48 +0200 Subject: [PATCH 02/18] More types --- sklearn/datasets/_openml.py | 38 ++++++++++++++++++++++++------------- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/sklearn/datasets/_openml.py b/sklearn/datasets/_openml.py index cb36734144f52..657a12505dbf5 100644 --- a/sklearn/datasets/_openml.py +++ b/sklearn/datasets/_openml.py @@ -18,7 +18,7 @@ import scipy.sparse from ..externals import _arff -from ..externals._arff import ArffDataType +from ..externals._arff import ArffDataType, ArffContainerType from . 
import get_data_home from urllib.error import HTTPError from ..utils import Bunch @@ -238,7 +238,7 @@ def _convert_arff_data( arff: Dict[str, Any], col_slice_x: List[int], col_slice_y: List[int], - shape: Optional[Tuple] =None + shape: Optional[Tuple] = None ) -> Tuple: """ converts the arff object into the appropriate matrix type (np.array or @@ -502,8 +502,12 @@ def _get_num_samples(data_qualities: Optional[ArffQuantileType]) -> int: return int(float(qualities.get('NumberOfInstances', default_n_samples))) -def _load_arff_response(url, data_home, return_type, encode_nominal, - parse_arff): +def _load_arff_response( + url: str, + data_home: Optional[str], + return_type, encode_nominal: bool, + parse_arff: Callable[[ArffContainerType], Tuple] +) -> Tuple: """Load arff data with url and parses arff response with parse_arff""" response = _open_openml_url(url, data_home) @@ -516,9 +520,17 @@ def _load_arff_response(url, data_home, return_type, encode_nominal, return parse_arff(arff) -def _download_data_to_bunch(url, sparse, data_home, *, - as_frame, features_list, data_columns, - target_columns, shape): +def _download_data_to_bunch( + url: str, + sparse: bool, + data_home: Optional[str], + *, + as_frame: bool, + features_list: List, + data_columns: List[int], + target_columns: List, + shape: Optional[Tuple[int, int]] +): """Download OpenML ARFF and convert to Bunch of data """ # NB: this function is long in order to handle retry for any failure @@ -552,12 +564,15 @@ def _download_data_to_bunch(url, sparse, data_home, *, return_type = _arff.DENSE_GEN frame = nominal_attributes = None + + parse_arff: Callable + postprocess: Callable if as_frame: columns = data_columns + target_columns parse_arff = partial(_convert_arff_data_dataframe, columns=columns, features_dict=features_dict) - def postprocess_frame(frame): + def postprocess(frame): X = frame[data_columns] if len(target_columns) >= 2: y = frame[target_columns] @@ -578,7 +593,7 @@ def parse_arff(arff): k in data_columns + target_columns} return X, y, nominal_attributes - def postprocess_X_y(X, y, nominal_attributes): + def postprocess(X, y, nominal_attributes): is_classification = {col_name in nominal_attributes for col_name in target_columns} if not is_classification: @@ -609,10 +624,7 @@ def postprocess_X_y(X, y, nominal_attributes): return_type=return_type, encode_nominal=not as_frame, parse_arff=parse_arff) - if as_frame: - X, y, frame, nominal_attributes = postprocess_frame(*out) - else: - X, y, frame, nominal_attributes = postprocess_X_y(*out) + X, y, frame, nominal_attributes = postprocess(*out) return Bunch(data=X, target=y, frame=frame, categories=nominal_attributes, From 14d34c489c699fde47feee40cb28adbac9326ae6 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sun, 26 Apr 2020 21:40:49 +0200 Subject: [PATCH 03/18] Fix merge conflicts --- sklearn/datasets/_openml.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/datasets/_openml.py b/sklearn/datasets/_openml.py index c01e6719e8578..657a12505dbf5 100644 --- a/sklearn/datasets/_openml.py +++ b/sklearn/datasets/_openml.py @@ -572,7 +572,7 @@ def _download_data_to_bunch( parse_arff = partial(_convert_arff_data_dataframe, columns=columns, features_dict=features_dict) - def postprocess(frame): # type:ignore + def postprocess(frame): X = frame[data_columns] if len(target_columns) >= 2: y = frame[target_columns] @@ -593,7 +593,7 @@ def parse_arff(arff): k in data_columns + target_columns} return X, y, nominal_attributes - def postprocess(X, y, 
nominal_attributes): # type:ignore + def postprocess(X, y, nominal_attributes): is_classification = {col_name in nominal_attributes for col_name in target_columns} if not is_classification: From a31781b3db5846f6e3c425b98d7bd20de86e38ec Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sun, 26 Apr 2020 21:50:43 +0200 Subject: [PATCH 04/18] More fixes --- sklearn/datasets/_openml.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sklearn/datasets/_openml.py b/sklearn/datasets/_openml.py index 657a12505dbf5..e3ba1b15dcc9e 100644 --- a/sklearn/datasets/_openml.py +++ b/sklearn/datasets/_openml.py @@ -35,8 +35,9 @@ _DATA_QUALITIES = "api/v1/json/data/qualities/{}" _DATA_FILE = "data/v1/download/{}" +OpenmlQuantileType = List[Dict[str, str]] +OpenmlFeaturesType = List[Dict[str, str]] -ArffQuantileType = List[Dict[str, str]] def _get_local_path(openml_path: str, data_home: str) -> str: @@ -445,7 +446,7 @@ def _get_data_description_by_id( def _get_data_features( data_id: int, data_home: Optional[str] -) -> List[Dict[str, str]]: +) -> OpenmlFeaturesType: # OpenML function: # https://www.openml.org/api_docs#!/data/get_data_features_id url = _DATA_FEATURES.format(data_id) @@ -460,7 +461,7 @@ def _get_data_features( def _get_data_qualities( data_id: int, data_home: Optional[str] -) -> Optional[ArffQuantileType]: +) -> Optional[OpenmlQuantileType]: # OpenML API function: # https://www.openml.org/api_docs#!/data/get_data_qualities_id url = _DATA_QUALITIES.format(data_id) @@ -478,7 +479,7 @@ def _get_data_qualities( return None -def _get_num_samples(data_qualities: Optional[ArffQuantileType]) -> int: +def _get_num_samples(data_qualities: Optional[OpenmlQuantileType]) -> int: """Get the number of samples from data qualities. Parameters From 59231e23e7a3aad46d8464bbf6adacec1548b941 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sun, 26 Apr 2020 22:08:11 +0200 Subject: [PATCH 05/18] Fixing CI --- azure-pipelines.yml | 2 ++ sklearn/externals/_arff.py | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 3e7b46039dfc4..6cef71b03671a 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -18,11 +18,13 @@ jobs: - bash: sudo chown -R $USER $CONDA displayName: Take ownership of conda installation - bash: | + conda init bash conda create --name flake8_env --yes python=3.8 conda activate flake8_env pip install flake8 mypy==0.770 displayName: Install flake8 - bash: | + set -ex if [[ $BUILD_SOURCEVERSIONMESSAGE =~ \[lint\ skip\] ]]; then # skip linting echo "Skipping linting" diff --git a/sklearn/externals/_arff.py b/sklearn/externals/_arff.py index 6d0ed3345ca8f..de9d6e3a0d552 100644 --- a/sklearn/externals/_arff.py +++ b/sklearn/externals/_arff.py @@ -185,7 +185,6 @@ class ArffContainerType(TypedDict): else: ArffContainerType = Dict[str, Any] - def _build_re_values(): quoted_re = r''' " # open quote followed by zero or more of: From 5fa0eddf5f4179331f031605f186c7e0c422b8e3 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sun, 26 Apr 2020 22:09:44 +0200 Subject: [PATCH 06/18] Lint --- sklearn/datasets/_openml.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/datasets/_openml.py b/sklearn/datasets/_openml.py index e3ba1b15dcc9e..bf10af738e9be 100644 --- a/sklearn/datasets/_openml.py +++ b/sklearn/datasets/_openml.py @@ -39,7 +39,6 @@ OpenmlFeaturesType = List[Dict[str, str]] - def _get_local_path(openml_path: str, data_home: str) -> str: return os.path.join(data_home, 'openml.org', openml_path + ".gz") From 
d202ecc97427702a74877a7141d79a1a743139eb Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sun, 26 Apr 2020 22:17:10 +0200 Subject: [PATCH 07/18] Conda activate doesn't work --- azure-pipelines.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 6cef71b03671a..c850154afb24b 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -18,9 +18,9 @@ jobs: - bash: sudo chown -R $USER $CONDA displayName: Take ownership of conda installation - bash: | - conda init bash + set -ex conda create --name flake8_env --yes python=3.8 - conda activate flake8_env + source activate flake8_env pip install flake8 mypy==0.770 displayName: Install flake8 - bash: | @@ -30,7 +30,7 @@ jobs: echo "Skipping linting" exit 0 else - conda activate flake8_env + source activate flake8_env ./build_tools/circle/linting.sh fi displayName: Run linting @@ -41,7 +41,7 @@ jobs: echo "Skipping linting" exit 0 else - conda activate flake8_env + source activate flake8_env mypy sklearn/ --ignore-missing-imports fi displayName: Run mypy From e50203dd83e86d55242cadb3500c5ff5314becf5 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sun, 26 Apr 2020 22:31:29 +0200 Subject: [PATCH 08/18] Simplify exception handling --- sklearn/datasets/_openml.py | 56 +++++++++++++++---------------------- 1 file changed, 22 insertions(+), 34 deletions(-) diff --git a/sklearn/datasets/_openml.py b/sklearn/datasets/_openml.py index bf10af738e9be..0c625c5f2eae0 100644 --- a/sklearn/datasets/_openml.py +++ b/sklearn/datasets/_openml.py @@ -127,12 +127,15 @@ def is_gzip(_fsrc): return gzip.GzipFile(local_path, 'rb') +class OpenMLError(Exception): + pass + + def _get_json_content_from_openml_api( url: str, error_message: Optional[str], - raise_if_error: bool, data_home: Optional[str] -) -> Optional[Dict]: +) -> Dict: """ Loads json data from the openml api @@ -146,21 +149,14 @@ def _get_json_content_from_openml_api( (acceptable error is, e.g., data id not found. Other errors, like 404's will throw the native error message) - raise_if_error : bool - Whether to raise an error if OpenML returns an acceptable error (e.g., - date not found). If this argument is set to False, a None is returned - in case of acceptable errors. Note that all other errors (e.g., 404) - will still be raised as normal. - data_home : str or None Location to cache the response. None if no cache is required. Returns ------- - json_data : json or None - the json result from the OpenML server if the call was successful; - None otherwise iff raise_if_error was set to False and the error was - ``acceptable`` + json_data : json + the json result from the OpenML server if the call was successful. + An exception otherwise. 
""" @_retry_with_clean_cache(url, data_home) @@ -177,9 +173,7 @@ def _load_json(): raise error # 412 error, not in except for nicer traceback - if raise_if_error: - raise ValueError(error_message) - return None + raise OpenMLError(error_message) def _split_sparse_columns( @@ -395,10 +389,8 @@ def _get_data_info_by_name( url = _SEARCH_NAME.format(name) + "/status/active/" error_msg = "No active dataset {} found.".format(name) json_data = _get_json_content_from_openml_api( - url, error_msg, raise_if_error=True, data_home=data_home + url, error_msg, data_home=data_home ) - # json_data should never be None with raise_if_error=True - assert json_data is not None res = json_data['data']['dataset'] if len(res) > 1: warn("Multiple active versions of the dataset matching the name" @@ -409,10 +401,14 @@ def _get_data_info_by_name( # an integer version has been provided url = (_SEARCH_NAME + "/data_version/{}").format(name, version) - json_data = _get_json_content_from_openml_api( - url, error_message=None, raise_if_error=False, data_home=data_home - ) - if json_data is None: + try: + json_data = _get_json_content_from_openml_api( + url, error_message=None, data_home=data_home + ) + retry = False + except OpenMLError: + retry = True + if retry: # we can do this in 1 function call if OpenML does not require the # specification of the dataset status (i.e., return datasets with a # given name / version regardless of active, deactivated, etc. ) @@ -421,10 +417,8 @@ def _get_data_info_by_name( error_msg = "Dataset {} with version {} not found.".format(name, version) json_data = _get_json_content_from_openml_api( - url, error_msg, raise_if_error=True, data_home=data_home + url, error_msg, data_home=data_home ) - # json_data should never be None with raise_if_error=True - assert json_data is not None return json_data['data']['dataset'][0] @@ -436,10 +430,8 @@ def _get_data_description_by_id( url = _DATA_INFO.format(data_id) error_message = "Dataset with data_id {} not found.".format(data_id) json_data = _get_json_content_from_openml_api( - url, error_message, raise_if_error=True, data_home=data_home + url, error_message, data_home=data_home ) - # json_data should never be None with raise_if_error=True - assert json_data is not None return json_data['data_set_description'] @@ -451,10 +443,8 @@ def _get_data_features( url = _DATA_FEATURES.format(data_id) error_message = "Dataset with data_id {} not found.".format(data_id) json_data = _get_json_content_from_openml_api( - url, error_message, raise_if_error=True, data_home=data_home + url, error_message, data_home=data_home ) - # json_data should never be None with raise_if_error=True - assert json_data is not None return json_data['data_features']['feature'] @@ -466,10 +456,8 @@ def _get_data_qualities( url = _DATA_QUALITIES.format(data_id) error_message = "Dataset with data_id {} not found.".format(data_id) json_data = _get_json_content_from_openml_api( - url, error_message, raise_if_error=True, data_home=data_home + url, error_message, data_home=data_home ) - # json_data should never be None with raise_if_error=True - assert json_data is not None try: return json_data['data_qualities']['quality'] except KeyError: From 00edef4205bcbc0e2717f5eac3b1ad120f5b666a Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sun, 26 Apr 2020 22:36:44 +0200 Subject: [PATCH 09/18] Simplify data_qualities --- sklearn/datasets/_openml.py | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/sklearn/datasets/_openml.py b/sklearn/datasets/_openml.py 
index 0c625c5f2eae0..305a8fb6e67c3 100644 --- a/sklearn/datasets/_openml.py +++ b/sklearn/datasets/_openml.py @@ -35,7 +35,7 @@ _DATA_QUALITIES = "api/v1/json/data/qualities/{}" _DATA_FILE = "data/v1/download/{}" -OpenmlQuantileType = List[Dict[str, str]] +OpenmlQualitiesType = List[Dict[str, str]] OpenmlFeaturesType = List[Dict[str, str]] @@ -450,7 +450,7 @@ def _get_data_features( def _get_data_qualities( data_id: int, data_home: Optional[str] -) -> Optional[OpenmlQuantileType]: +) -> OpenmlQualitiesType: # OpenML API function: # https://www.openml.org/api_docs#!/data/get_data_qualities_id url = _DATA_QUALITIES.format(data_id) @@ -458,15 +458,12 @@ def _get_data_qualities( json_data = _get_json_content_from_openml_api( url, error_message, data_home=data_home ) - try: - return json_data['data_qualities']['quality'] - except KeyError: - # the qualities might not be available, but we still try to process - # the data - return None + # the qualities might not be available, but we still try to process + # the data + return json_data['data_qualities'].get('quality', []) -def _get_num_samples(data_qualities: Optional[OpenmlQuantileType]) -> int: +def _get_num_samples(data_qualities: OpenmlQualitiesType) -> int: """Get the number of samples from data qualities. Parameters @@ -483,9 +480,6 @@ def _get_num_samples(data_qualities: Optional[OpenmlQuantileType]) -> int: # If the data qualities are unavailable, we return -1 default_n_samples = -1 - if data_qualities is None: - return default_n_samples - qualities = {d['name']: d['value'] for d in data_qualities} return int(float(qualities.get('NumberOfInstances', default_n_samples))) From 937f606d9907f4bd601a8f03cd33325679cc6a63 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sun, 26 Apr 2020 22:47:09 +0200 Subject: [PATCH 10/18] Fix tests --- sklearn/datasets/_openml.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/sklearn/datasets/_openml.py b/sklearn/datasets/_openml.py index 305a8fb6e67c3..2f333d3594a4f 100644 --- a/sklearn/datasets/_openml.py +++ b/sklearn/datasets/_openml.py @@ -88,7 +88,7 @@ def _open_openml_url(openml_path: str, data_home: Optional[str]): result : stream A stream to the OpenML resource """ - def is_gzip(_fsrc): + def is_gzip_encoded(_fsrc): return _fsrc.info().get('Content-Encoding', '') == 'gzip' req = Request(_OPENML_PREFIX + openml_path) @@ -96,7 +96,7 @@ def is_gzip(_fsrc): if data_home is None: fsrc = urlopen(req) - if is_gzip(fsrc): + if is_gzip_encoded(fsrc): return gzip.GzipFile(fileobj=fsrc, mode='rb') return fsrc @@ -111,10 +111,10 @@ def is_gzip(_fsrc): try: with closing(urlopen(req)) as fsrc: opener: Callable - if is_gzip(fsrc): - opener = gzip.GzipFile - else: + if is_gzip_encoded(fsrc): opener = open + else: + opener = gzip.GzipFile with opener(local_path, 'wb') as fdst: shutil.copyfileobj(fsrc, fdst) except Exception: @@ -127,7 +127,8 @@ def is_gzip(_fsrc): return gzip.GzipFile(local_path, 'rb') -class OpenMLError(Exception): +class OpenMLError(ValueError): + """HTTP 412 is a specific OpenML error code, indicating a generic error""" pass @@ -460,7 +461,7 @@ def _get_data_qualities( ) # the qualities might not be available, but we still try to process # the data - return json_data['data_qualities'].get('quality', []) + return json_data.get('data_qualities', {}).get('quality', []) def _get_num_samples(data_qualities: OpenmlQualitiesType) -> int: From 55ad34b49791eae65c4cad5b00895c9fa43a7037 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Wed, 3 Jun 2020 22:29:42 +0200 
Subject: [PATCH 11/18] Apply suggestions from code review Co-authored-by: Thomas J. Fan --- sklearn/datasets/_openml.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/datasets/_openml.py b/sklearn/datasets/_openml.py index 2f333d3594a4f..edfd64923b64d 100644 --- a/sklearn/datasets/_openml.py +++ b/sklearn/datasets/_openml.py @@ -230,7 +230,7 @@ def _sparse_data_to_array( def _convert_arff_data( - arff: Dict[str, Any], + arff: ArffContainerType, col_slice_x: List[int], col_slice_y: List[int], shape: Optional[Tuple] = None @@ -307,7 +307,7 @@ def _feature_to_dtype(feature: Dict[str, str]): def _convert_arff_data_dataframe( - arff, columns: List, features_dict: Dict[str, Any] + arff: ArffContainerType, columns: List, features_dict: Dict[str, Any] ) -> Tuple: """Convert the ARFF object into a pandas DataFrame. From 3856d5b5dac22cc12d5d29e261b9e999f3e21969 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 9 Jun 2020 10:52:22 +0200 Subject: [PATCH 12/18] Address review comments --- .pre-commit-config.yaml | 2 +- azure-pipelines.yml | 2 +- sklearn/datasets/_openml.py | 9 ++++----- sklearn/externals/_arff.py | 6 ++++-- 4 files changed, 10 insertions(+), 9 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index aa8df3c3cbc87..bf1bef99a5642 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -14,7 +14,7 @@ repos: # the code is not fully PEP8 compatible args: [--select=F401] - repo: https://github.com/pre-commit/mirrors-mypy - rev: v0.730 + rev: v0.780 hooks: - id: mypy args: diff --git a/azure-pipelines.yml b/azure-pipelines.yml index fb533177b11c6..afb67e837a5bd 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -17,7 +17,7 @@ jobs: inputs: versionSpec: '3.8' - bash: | - pip install flake8 mypy==0.770 + pip install flake8 mypy==0.780 displayName: Install linters - bash: | set -ex diff --git a/sklearn/datasets/_openml.py b/sklearn/datasets/_openml.py index 1f91925d70d73..9ac65d4f92d63 100644 --- a/sklearn/datasets/_openml.py +++ b/sklearn/datasets/_openml.py @@ -198,11 +198,11 @@ def _split_sparse_columns( Returns ------- - arff_data_new : tuple + arff_data_new : list Subset of arff data with only the include columns indicated by the include_columns argument. """ - arff_data_new: ArffDataType = (list(), list(), list()) + arff_data_new: ArffDataType = [list(), list(), list()] reindexed_columns = {column_idx: array_idx for array_idx, column_idx in enumerate(include_columns)} for val, row_idx, col_idx in zip(arff_data[0], arff_data[1], arff_data[2]): @@ -334,6 +334,7 @@ def _convert_arff_data_dataframe( arff_columns = list(attributes) # calculate chunksize + raise ValueError first_row = next(arff['data']) first_df = pd.DataFrame([first_row], columns=arff_columns) @@ -407,10 +408,7 @@ def _get_data_info_by_name( json_data = _get_json_content_from_openml_api( url, error_message=None, data_home=data_home ) - retry = False except OpenMLError: - retry = True - if retry: # we can do this in 1 function call if OpenML does not require the # specification of the dataset status (i.e., return datasets with a # given name / version regardless of active, deactivated, etc. 
) @@ -658,6 +656,7 @@ def _valid_data_column_names(features_list, target_columns): return valid_data_column_names +@_deprecate_positional_args def fetch_openml( name: Optional[str] = None, *, diff --git a/sklearn/externals/_arff.py b/sklearn/externals/_arff.py index de9d6e3a0d552..36ac9c0022680 100644 --- a/sklearn/externals/_arff.py +++ b/sklearn/externals/_arff.py @@ -149,7 +149,7 @@ __version__ = '2.4.0' import typing -from typing import Optional, Tuple, List, Dict, Any +from typing import Optional, List, Dict, Any import re import sys @@ -172,9 +172,10 @@ _RE_SPARSE_LINE = re.compile(r'^\s*\{.*\}\s*$', re.UNICODE) _RE_NONTRIVIAL_DATA = re.compile('["\'{}\\s]', re.UNICODE) -ArffDataType = Tuple[List, ...] +ArffDataType = List[List] if typing.TYPE_CHECKING: + # typing_extensions is only available is mypy is installed from typing_extensions import TypedDict class ArffContainerType(TypedDict): @@ -185,6 +186,7 @@ class ArffContainerType(TypedDict): else: ArffContainerType = Dict[str, Any] + def _build_re_values(): quoted_re = r''' " # open quote followed by zero or more of: From c514c5326114ee42e0130a1c9223116556ff679b Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 9 Jun 2020 12:55:06 +0200 Subject: [PATCH 13/18] More fixes --- sklearn/datasets/_openml.py | 14 ++++++++------ sklearn/externals/_arff.py | 9 ++++++--- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/sklearn/datasets/_openml.py b/sklearn/datasets/_openml.py index 9ac65d4f92d63..a4e8af9570996 100644 --- a/sklearn/datasets/_openml.py +++ b/sklearn/datasets/_openml.py @@ -18,7 +18,7 @@ import scipy.sparse from ..externals import _arff -from ..externals._arff import ArffDataType, ArffContainerType +from ..externals._arff import ArffSparseDataType, ArffContainerType from . import get_data_home from urllib.error import HTTPError from ..utils import Bunch @@ -179,8 +179,8 @@ def _load_json(): def _split_sparse_columns( - arff_data: ArffDataType, include_columns: List -) -> ArffDataType: + arff_data: ArffSparseDataType, include_columns: List +) -> ArffSparseDataType: """ obtains several columns from sparse arff representation. Additionally, the column indices are re-labelled, given the columns that are not included. @@ -202,7 +202,7 @@ def _split_sparse_columns( Subset of arff data with only the include columns indicated by the include_columns argument. """ - arff_data_new: ArffDataType = [list(), list(), list()] + arff_data_new: ArffSparseDataType = (list(), list(), list()) reindexed_columns = {column_idx: array_idx for array_idx, column_idx in enumerate(include_columns)} for val, row_idx, col_idx in zip(arff_data[0], arff_data[1], arff_data[2]): @@ -214,7 +214,7 @@ def _split_sparse_columns( def _sparse_data_to_array( - arff_data: ArffDataType, include_columns: List + arff_data: ArffSparseDataType, include_columns: List ) -> np.ndarray: # turns the sparse data back into an array (can't use toarray() function, # as this does only work on numeric data) @@ -333,8 +333,10 @@ def _convert_arff_data_dataframe( attributes = OrderedDict(arff['attributes']) arff_columns = list(attributes) + if isinstance(arff['data'], tuple): + raise ValueError("Unreachable code. 
arff['data'] must be a generator.") + # calculate chunksize - raise ValueError first_row = next(arff['data']) first_df = pd.DataFrame([first_row], columns=arff_columns) diff --git a/sklearn/externals/_arff.py b/sklearn/externals/_arff.py index 36ac9c0022680..1e4cc0fd69d21 100644 --- a/sklearn/externals/_arff.py +++ b/sklearn/externals/_arff.py @@ -149,7 +149,7 @@ __version__ = '2.4.0' import typing -from typing import Optional, List, Dict, Any +from typing import Optional, List, Dict, Any, Iterator, Union, Tuple import re import sys @@ -172,7 +172,9 @@ _RE_SPARSE_LINE = re.compile(r'^\s*\{.*\}\s*$', re.UNICODE) _RE_NONTRIVIAL_DATA = re.compile('["\'{}\\s]', re.UNICODE) -ArffDataType = List[List] +ArffDenseDataType = Iterator[List] +ArffSparseDataType = Tuple[List, ...] + if typing.TYPE_CHECKING: # typing_extensions is only available is mypy is installed @@ -182,7 +184,8 @@ class ArffContainerType(TypedDict): description: str relation: str attributes: List - data: ArffDataType + data: Union[ArffDenseDataType, ArffSparseDataType] + else: ArffContainerType = Dict[str, Any] From ebf1b59520c4f82028108cc15b8a8c57869265d5 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 9 Jun 2020 13:03:06 +0200 Subject: [PATCH 14/18] Another fix --- sklearn/datasets/_openml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/datasets/_openml.py b/sklearn/datasets/_openml.py index a4e8af9570996..a5ed71d075fa9 100644 --- a/sklearn/datasets/_openml.py +++ b/sklearn/datasets/_openml.py @@ -198,7 +198,7 @@ def _split_sparse_columns( Returns ------- - arff_data_new : list + arff_data_new : tuple Subset of arff data with only the include columns indicated by the include_columns argument. """ From 55ec82bcbd806a545ebb6f0b369be5e795c70a89 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 9 Jun 2020 13:05:34 +0200 Subject: [PATCH 15/18] Another typo --- sklearn/externals/_arff.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/externals/_arff.py b/sklearn/externals/_arff.py index 1e4cc0fd69d21..fa75808357321 100644 --- a/sklearn/externals/_arff.py +++ b/sklearn/externals/_arff.py @@ -177,7 +177,7 @@ if typing.TYPE_CHECKING: - # typing_extensions is only available is mypy is installed + # typing_extensions is available when mypy is installed from typing_extensions import TypedDict class ArffContainerType(TypedDict): From ebf80d1ead08df32c6eaa9cae2e6302bfbed0acd Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 9 Jun 2020 16:41:46 +0200 Subject: [PATCH 16/18] Check arff['data'] for being an iterable instead --- sklearn/datasets/_openml.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sklearn/datasets/_openml.py b/sklearn/datasets/_openml.py index a5ed71d075fa9..01a238902dfc0 100644 --- a/sklearn/datasets/_openml.py +++ b/sklearn/datasets/_openml.py @@ -333,8 +333,10 @@ def _convert_arff_data_dataframe( attributes = OrderedDict(arff['attributes']) arff_columns = list(attributes) - if isinstance(arff['data'], tuple): - raise ValueError("Unreachable code. arff['data'] must be a generator.") + if not isinstance(arff['data'], Generator): + raise ValueError( + "arff['data'] must be a generator when converting to pd.DataFrame." 
+ ) # calculate chunksize first_row = next(arff['data']) From 8c1fbfa2b1b952efc733bfffccdf7846b00fa02f Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 25 Jun 2020 17:40:22 +0200 Subject: [PATCH 17/18] Add requested tests --- sklearn/datasets/tests/test_openml.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py index 950c208444b7d..2f935072cb351 100644 --- a/sklearn/datasets/tests/test_openml.py +++ b/sklearn/datasets/tests/test_openml.py @@ -14,6 +14,8 @@ from sklearn.datasets._openml import (_open_openml_url, _arff, _DATA_FILE, + _convert_arff_data, + _convert_arff_data_dataframe, _get_data_description_by_id, _get_local_path, _retry_with_clean_cache, @@ -24,6 +26,7 @@ from sklearn.utils._testing import assert_allclose, assert_array_equal from urllib.error import HTTPError from sklearn.datasets.tests.test_common import check_return_X_y +from sklearn.externals._arff import ArffContainerType from functools import partial @@ -1204,3 +1207,25 @@ def test_fetch_openml_with_ignored_feature(monkeypatch, gzip_response): # so we assert that we don't have the ignored feature in the final Bunch assert dataset['data'].shape == (101, 16) assert 'animal' not in dataset['feature_names'] + + +def test_convert_arff_data_type(): + arff: ArffContainerType = { + 'data': (el for el in range(2)), + 'description': '', + 'relation': '', + 'attributes': [] + } + msg = r"shape must be provided when arr\['data'\] is a Generator" + with pytest.raises(ValueError, match=msg): + _convert_arff_data(arff, [0], [0], shape=None) + + arff = { + 'data': list(range(2)), + 'description': '', + 'relation': '', + 'attributes': [] + } + msg = r"arff\['data'\] must be a generator when converting to pd.DataFrame" + with pytest.raises(ValueError, match=msg): + _convert_arff_data_dataframe(arff, ['a'], {}) From c9a974a5a9de26471a2fa5d1c5a2d9d939bdedaa Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 25 Jun 2020 17:57:23 +0200 Subject: [PATCH 18/18] Only run test if pandas is installed --- sklearn/datasets/tests/test_openml.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py index 2f935072cb351..b16a34727c2aa 100644 --- a/sklearn/datasets/tests/test_openml.py +++ b/sklearn/datasets/tests/test_openml.py @@ -1210,6 +1210,8 @@ def test_fetch_openml_with_ignored_feature(monkeypatch, gzip_response): def test_convert_arff_data_type(): + pytest.importorskip('pandas') + arff: ArffContainerType = { 'data': (el for el in range(2)), 'description': '',
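
Taken together, the central pattern these commits introduce is a TypedDict that exists only for the type checker, with a plain Dict fallback at runtime so that typing_extensions stays a mypy-only dependency. Below is a minimal, self-contained sketch of that pattern, separate from the patches above: the ArffContainerType, ArffDenseDataType and ArffSparseDataType names and their fields mirror those added to sklearn/externals/_arff.py, while load_container and the demo at the bottom are purely illustrative and not part of scikit-learn.

import typing
from typing import Any, Dict, Iterator, List, Tuple, Union

ArffDenseDataType = Iterator[List]
ArffSparseDataType = Tuple[List, ...]

if typing.TYPE_CHECKING:
    # Only evaluated by mypy, so typing_extensions need not be installed
    # at runtime (it is pulled in with mypy itself).
    from typing_extensions import TypedDict

    class ArffContainerType(TypedDict):
        description: str
        relation: str
        attributes: List
        data: Union[ArffDenseDataType, ArffSparseDataType]
else:
    # At runtime the alias degrades gracefully to an untyped mapping.
    ArffContainerType = Dict[str, Any]


def load_container() -> "ArffContainerType":
    # Hypothetical helper, not part of scikit-learn: builds an empty
    # container in the shape the TypedDict above describes.
    return {
        'description': '',
        'relation': '',
        'attributes': [],
        'data': iter([]),
    }


if __name__ == "__main__":
    # Running this file needs neither mypy nor typing_extensions;
    # running `mypy` on it additionally checks the dict keys and types.
    print(sorted(load_container()))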