From 7692122a8b26c6b1913cdeb3c93ec271a582377d Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sun, 26 Apr 2020 20:18:45 +0200 Subject: [PATCH 01/18] Type annotations for OpenML fetcher --- azure-pipelines.yml | 1 + sklearn/cluster/_agglomerative.py | 3 +- sklearn/datasets/_base.py | 2 +- sklearn/datasets/_openml.py | 138 ++++++++++++++++++++++-------- sklearn/externals/_arff.py | 18 +++- sklearn/manifold/_t_sne.py | 3 +- sklearn/utils/validation.py | 3 +- 7 files changed, 124 insertions(+), 44 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 196d4ca34f434..3e7b46039dfc4 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -33,6 +33,7 @@ jobs: fi displayName: Run linting - bash: | + set -ex if [[ $BUILD_SOURCEVERSIONMESSAGE =~ \[lint\ skip\] ]]; then # skip linting echo "Skipping linting" diff --git a/sklearn/cluster/_agglomerative.py b/sklearn/cluster/_agglomerative.py index 92246141d6fe8..94c057fbea09d 100644 --- a/sklearn/cluster/_agglomerative.py +++ b/sklearn/cluster/_agglomerative.py @@ -21,7 +21,8 @@ from ..neighbors import DistanceMetric from ..neighbors._dist_metrics import METRIC_MAPPING -from . import _hierarchical_fast as _hierarchical +# error: Module 'sklearn.cluster' has no attribute '_hierarchical_fast' +from . import _hierarchical_fast as _hierarchical # type: ignore from ._feature_agglomeration import AgglomerationTransform from ..utils._fast_dict import IntFloatDict from ..utils.fixes import _astype_copy_false diff --git a/sklearn/datasets/_base.py b/sklearn/datasets/_base.py index 9737a5f67891a..f88b7fbc81ee5 100644 --- a/sklearn/datasets/_base.py +++ b/sklearn/datasets/_base.py @@ -26,7 +26,7 @@ ['filename', 'url', 'checksum']) -def get_data_home(data_home=None): +def get_data_home(data_home=None) -> str: """Return the path of the scikit-learn data dir. This folder is used by some large dataset loaders to avoid downloading the diff --git a/sklearn/datasets/_openml.py b/sklearn/datasets/_openml.py index 10f40dc8906bf..cb36734144f52 100644 --- a/sklearn/datasets/_openml.py +++ b/sklearn/datasets/_openml.py @@ -6,6 +6,7 @@ from warnings import warn from contextlib import closing from functools import wraps +from typing import Callable, Optional, Dict, Tuple, List, Any, Union import itertools from collections.abc import Generator from collections import OrderedDict @@ -17,6 +18,7 @@ import scipy.sparse from ..externals import _arff +from ..externals._arff import ArffDataType from . import get_data_home from urllib.error import HTTPError from ..utils import Bunch @@ -34,11 +36,16 @@ _DATA_FILE = "data/v1/download/{}" -def _get_local_path(openml_path, data_home): +ArffQuantileType = List[Dict[str, str]] + + +def _get_local_path(openml_path: str, data_home: str) -> str: return os.path.join(data_home, 'openml.org', openml_path + ".gz") -def _retry_with_clean_cache(openml_path, data_home): +def _retry_with_clean_cache( + openml_path: str, data_home: Optional[str] +) -> Callable: """If the first call to the decorated function fails, the local cached file is removed, and the function is called again. If ``data_home`` is ``None``, then the function is called once. @@ -62,7 +69,7 @@ def wrapper(*args, **kw): return decorator -def _open_openml_url(openml_path, data_home): +def _open_openml_url(openml_path: str, data_home: Optional[str]): """ Returns a resource from OpenML.org. Caches it to data_home if required. 
@@ -103,12 +110,13 @@ def is_gzip(_fsrc): try: with closing(urlopen(req)) as fsrc: + opener: Callable if is_gzip(fsrc): - with open(local_path, 'wb') as fdst: - shutil.copyfileobj(fsrc, fdst) + opener = gzip.GzipFile else: - with gzip.GzipFile(local_path, 'wb') as fdst: - shutil.copyfileobj(fsrc, fdst) + opener = open + with opener(local_path, 'wb') as fdst: + shutil.copyfileobj(fsrc, fdst) except Exception: if os.path.exists(local_path): os.unlink(local_path) @@ -119,8 +127,12 @@ def is_gzip(_fsrc): return gzip.GzipFile(local_path, 'rb') -def _get_json_content_from_openml_api(url, error_message, raise_if_error, - data_home): +def _get_json_content_from_openml_api( + url: str, + error_message: Optional[str], + raise_if_error: bool, + data_home: Optional[str] +) -> Optional[Dict]: """ Loads json data from the openml api @@ -170,7 +182,9 @@ def _load_json(): return None -def _split_sparse_columns(arff_data, include_columns): +def _split_sparse_columns( + arff_data: ArffDataType, include_columns: List +) -> ArffDataType: """ obtains several columns from sparse arff representation. Additionally, the column indices are re-labelled, given the columns that are not included. @@ -192,7 +206,7 @@ def _split_sparse_columns(arff_data, include_columns): Subset of arff data with only the include columns indicated by the include_columns argument. """ - arff_data_new = (list(), list(), list()) + arff_data_new: ArffDataType = (list(), list(), list()) reindexed_columns = {column_idx: array_idx for array_idx, column_idx in enumerate(include_columns)} for val, row_idx, col_idx in zip(arff_data[0], arff_data[1], arff_data[2]): @@ -203,7 +217,9 @@ def _split_sparse_columns(arff_data, include_columns): return arff_data_new -def _sparse_data_to_array(arff_data, include_columns): +def _sparse_data_to_array( + arff_data: ArffDataType, include_columns: List +) -> np.ndarray: # turns the sparse data back into an array (can't use toarray() function, # as this does only work on numeric data) num_obs = max(arff_data[1]) + 1 @@ -218,7 +234,12 @@ def _sparse_data_to_array(arff_data, include_columns): return y -def _convert_arff_data(arff, col_slice_x, col_slice_y, shape=None): +def _convert_arff_data( + arff: Dict[str, Any], + col_slice_x: List[int], + col_slice_y: List[int], + shape: Optional[Tuple] =None +) -> Tuple: """ converts the arff object into the appropriate matrix type (np.array or scipy.sparse.csr_matrix) based on the 'data part' (i.e., in the @@ -244,6 +265,10 @@ def _convert_arff_data(arff, col_slice_x, col_slice_y, shape=None): """ arff_data = arff['data'] if isinstance(arff_data, Generator): + if shape is None: + raise ValueError( + "shape must be provided when arr['data'] is a Generator" + ) if shape[0] == -1: count = -1 else: @@ -269,7 +294,7 @@ def _convert_arff_data(arff, col_slice_x, col_slice_y, shape=None): raise ValueError('Unexpected Data Type obtained from arff.') -def _feature_to_dtype(feature): +def _feature_to_dtype(feature: Dict[str, str]): """Map feature to dtype for pandas DataFrame """ if feature['data_type'] == 'string': @@ -286,7 +311,9 @@ def _feature_to_dtype(feature): raise ValueError('Unsupported feature: {}'.format(feature)) -def _convert_arff_data_dataframe(arff, columns, features_dict): +def _convert_arff_data_dataframe( + arff, columns: List, features_dict: Dict[str, Any] +) -> Tuple: """Convert the ARFF object into a pandas DataFrame. 
Parameters @@ -333,7 +360,9 @@ def _convert_arff_data_dataframe(arff, columns, features_dict): return (df, ) -def _get_data_info_by_name(name, version, data_home): +def _get_data_info_by_name( + name: str, version: Union[int, str], data_home: Optional[str] +): """ Utilizes the openml dataset listing api to find a dataset by name/version @@ -365,8 +394,11 @@ def _get_data_info_by_name(name, version, data_home): # situation in which we return the oldest active version url = _SEARCH_NAME.format(name) + "/status/active/" error_msg = "No active dataset {} found.".format(name) - json_data = _get_json_content_from_openml_api(url, error_msg, True, - data_home) + json_data = _get_json_content_from_openml_api( + url, error_msg, raise_if_error=True, data_home=data_home + ) + # json_data should never be None with raise_if_error=True + assert json_data is not None res = json_data['data']['dataset'] if len(res) > 1: warn("Multiple active versions of the dataset matching the name" @@ -377,8 +409,9 @@ def _get_data_info_by_name(name, version, data_home): # an integer version has been provided url = (_SEARCH_NAME + "/data_version/{}").format(name, version) - json_data = _get_json_content_from_openml_api(url, None, False, - data_home) + json_data = _get_json_content_from_openml_api( + url, error_message=None, raise_if_error=False, data_home=data_home + ) if json_data is None: # we can do this in 1 function call if OpenML does not require the # specification of the dataset status (i.e., return datasets with a @@ -387,38 +420,56 @@ def _get_data_info_by_name(name, version, data_home): url += "/status/deactivated" error_msg = "Dataset {} with version {} not found.".format(name, version) - json_data = _get_json_content_from_openml_api(url, error_msg, True, - data_home) + json_data = _get_json_content_from_openml_api( + url, error_msg, raise_if_error=True, data_home=data_home + ) + # json_data should never be None with raise_if_error=True + assert json_data is not None return json_data['data']['dataset'][0] -def _get_data_description_by_id(data_id, data_home): +def _get_data_description_by_id( + data_id: int, data_home: Optional[str] +) -> Dict[str, Any]: # OpenML API function: https://www.openml.org/api_docs#!/data/get_data_id url = _DATA_INFO.format(data_id) error_message = "Dataset with data_id {} not found.".format(data_id) - json_data = _get_json_content_from_openml_api(url, error_message, True, - data_home) + json_data = _get_json_content_from_openml_api( + url, error_message, raise_if_error=True, data_home=data_home + ) + # json_data should never be None with raise_if_error=True + assert json_data is not None return json_data['data_set_description'] -def _get_data_features(data_id, data_home): +def _get_data_features( + data_id: int, data_home: Optional[str] +) -> List[Dict[str, str]]: # OpenML function: # https://www.openml.org/api_docs#!/data/get_data_features_id url = _DATA_FEATURES.format(data_id) error_message = "Dataset with data_id {} not found.".format(data_id) - json_data = _get_json_content_from_openml_api(url, error_message, True, - data_home) + json_data = _get_json_content_from_openml_api( + url, error_message, raise_if_error=True, data_home=data_home + ) + # json_data should never be None with raise_if_error=True + assert json_data is not None return json_data['data_features']['feature'] -def _get_data_qualities(data_id, data_home): +def _get_data_qualities( + data_id: int, data_home: Optional[str] +) -> Optional[ArffQuantileType]: # OpenML API function: # 
https://www.openml.org/api_docs#!/data/get_data_qualities_id url = _DATA_QUALITIES.format(data_id) error_message = "Dataset with data_id {} not found.".format(data_id) - json_data = _get_json_content_from_openml_api(url, error_message, True, - data_home) + json_data = _get_json_content_from_openml_api( + url, error_message, raise_if_error=True, data_home=data_home + ) + # json_data should never be None with raise_if_error=True + assert json_data is not None try: return json_data['data_qualities']['quality'] except KeyError: @@ -427,7 +478,7 @@ def _get_data_qualities(data_id, data_home): return None -def _get_num_samples(data_qualities): +def _get_num_samples(data_qualities: Optional[ArffQuantileType]) -> int: """Get the number of samples from data qualities. Parameters @@ -506,7 +557,7 @@ def _download_data_to_bunch(url, sparse, data_home, *, parse_arff = partial(_convert_arff_data_dataframe, columns=columns, features_dict=features_dict) - def postprocess(frame): + def postprocess_frame(frame): X = frame[data_columns] if len(target_columns) >= 2: y = frame[target_columns] @@ -527,7 +578,7 @@ def parse_arff(arff): k in data_columns + target_columns} return X, y, nominal_attributes - def postprocess(X, y, nominal_attributes): + def postprocess_X_y(X, y, nominal_attributes): is_classification = {col_name in nominal_attributes for col_name in target_columns} if not is_classification: @@ -558,7 +609,10 @@ def postprocess(X, y, nominal_attributes): return_type=return_type, encode_nominal=not as_frame, parse_arff=parse_arff) - X, y, frame, nominal_attributes = postprocess(*out) + if as_frame: + X, y, frame, nominal_attributes = postprocess_frame(*out) + else: + X, y, frame, nominal_attributes = postprocess_X_y(*out) return Bunch(data=X, target=y, frame=frame, categories=nominal_attributes, @@ -608,9 +662,16 @@ def _valid_data_column_names(features_list, target_columns): return valid_data_column_names -def fetch_openml(name=None, version='active', data_id=None, data_home=None, - target_column='default-target', cache=True, return_X_y=False, - as_frame=False): +def fetch_openml( + name: Optional[str] = None, + version: Union[str, int] = 'active', + data_id: Optional[int] = None, + data_home: Optional[str] = None, + target_column: Optional[Union[str, List]] = 'default-target', + cache: bool = True, + return_X_y: bool = False, + as_frame: bool = False +): """Fetch dataset from openml by name or dataset id. Datasets are uniquely identified by either an integer ID or by a @@ -800,6 +861,7 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None, data_columns = _valid_data_column_names(features_list, target_columns) + shape: Optional[Tuple[int, int]] # determine arff encoding to return if not return_sparse: # The shape must include the ignored features to keep the right indexes diff --git a/sklearn/externals/_arff.py b/sklearn/externals/_arff.py index 63a39c3c15d4a..6d0ed3345ca8f 100644 --- a/sklearn/externals/_arff.py +++ b/sklearn/externals/_arff.py @@ -148,7 +148,8 @@ 'joel.nothman@gmail.com') __version__ = '2.4.0' -from typing import Optional +import typing +from typing import Optional, Tuple, List, Dict, Any import re import sys @@ -171,6 +172,19 @@ _RE_SPARSE_LINE = re.compile(r'^\s*\{.*\}\s*$', re.UNICODE) _RE_NONTRIVIAL_DATA = re.compile('["\'{}\\s]', re.UNICODE) +ArffDataType = Tuple[List, ...] 
+ +if typing.TYPE_CHECKING: + from typing_extensions import TypedDict + + class ArffContainerType(TypedDict): + description: str + relation: str + attributes: List + data: ArffDataType +else: + ArffContainerType = Dict[str, Any] + def _build_re_values(): quoted_re = r''' @@ -778,7 +792,7 @@ def _decode(self, s, encode_nominal=False, matrix_type=DENSE): s = s.strip('\r\n ').replace('\r\n', '\n').split('\n') # Create the return object - obj = { + obj: ArffContainerType = { u'description': u'', u'relation': u'', u'attributes': [], diff --git a/sklearn/manifold/_t_sne.py b/sklearn/manifold/_t_sne.py index eef67d5460e22..1abed4ab9fa3c 100644 --- a/sklearn/manifold/_t_sne.py +++ b/sklearn/manifold/_t_sne.py @@ -22,7 +22,8 @@ from ..utils.validation import _deprecate_positional_args from ..decomposition import PCA from ..metrics.pairwise import pairwise_distances -from . import _utils +# mypy error: Module 'sklearn.manifold' has no attribute '_utils' +from . import _utils # type: ignore # mypy error: Module 'sklearn.manifold' has no attribute '_barnes_hut_tsne' from . import _barnes_hut_tsne # type: ignore diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 953584fff0f8a..1b44ad69d968a 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -18,7 +18,8 @@ from distutils.version import LooseVersion from inspect import signature, isclass, Parameter -from numpy.core.numeric import ComplexWarning +# error: Module 'numpy.core.numeric' has no attribute 'ComplexWarning' +from numpy.core.numeric import ComplexWarning # type: ignore import joblib from contextlib import suppress From ffa469d97042771ce185831631368d80aaf504eb Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sun, 26 Apr 2020 21:38:48 +0200 Subject: [PATCH 02/18] More types --- sklearn/datasets/_openml.py | 38 ++++++++++++++++++++++++------------- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/sklearn/datasets/_openml.py b/sklearn/datasets/_openml.py index cb36734144f52..657a12505dbf5 100644 --- a/sklearn/datasets/_openml.py +++ b/sklearn/datasets/_openml.py @@ -18,7 +18,7 @@ import scipy.sparse from ..externals import _arff -from ..externals._arff import ArffDataType +from ..externals._arff import ArffDataType, ArffContainerType from . 
import get_data_home from urllib.error import HTTPError from ..utils import Bunch @@ -238,7 +238,7 @@ def _convert_arff_data( arff: Dict[str, Any], col_slice_x: List[int], col_slice_y: List[int], - shape: Optional[Tuple] =None + shape: Optional[Tuple] = None ) -> Tuple: """ converts the arff object into the appropriate matrix type (np.array or @@ -502,8 +502,12 @@ def _get_num_samples(data_qualities: Optional[ArffQuantileType]) -> int: return int(float(qualities.get('NumberOfInstances', default_n_samples))) -def _load_arff_response(url, data_home, return_type, encode_nominal, - parse_arff): +def _load_arff_response( + url: str, + data_home: Optional[str], + return_type, encode_nominal: bool, + parse_arff: Callable[[ArffContainerType], Tuple] +) -> Tuple: """Load arff data with url and parses arff response with parse_arff""" response = _open_openml_url(url, data_home) @@ -516,9 +520,17 @@ def _load_arff_response(url, data_home, return_type, encode_nominal, return parse_arff(arff) -def _download_data_to_bunch(url, sparse, data_home, *, - as_frame, features_list, data_columns, - target_columns, shape): +def _download_data_to_bunch( + url: str, + sparse: bool, + data_home: Optional[str], + *, + as_frame: bool, + features_list: List, + data_columns: List[int], + target_columns: List, + shape: Optional[Tuple[int, int]] +): """Download OpenML ARFF and convert to Bunch of data """ # NB: this function is long in order to handle retry for any failure @@ -552,12 +564,15 @@ def _download_data_to_bunch(url, sparse, data_home, *, return_type = _arff.DENSE_GEN frame = nominal_attributes = None + + parse_arff: Callable + postprocess: Callable if as_frame: columns = data_columns + target_columns parse_arff = partial(_convert_arff_data_dataframe, columns=columns, features_dict=features_dict) - def postprocess_frame(frame): + def postprocess(frame): X = frame[data_columns] if len(target_columns) >= 2: y = frame[target_columns] @@ -578,7 +593,7 @@ def parse_arff(arff): k in data_columns + target_columns} return X, y, nominal_attributes - def postprocess_X_y(X, y, nominal_attributes): + def postprocess(X, y, nominal_attributes): is_classification = {col_name in nominal_attributes for col_name in target_columns} if not is_classification: @@ -609,10 +624,7 @@ def postprocess_X_y(X, y, nominal_attributes): return_type=return_type, encode_nominal=not as_frame, parse_arff=parse_arff) - if as_frame: - X, y, frame, nominal_attributes = postprocess_frame(*out) - else: - X, y, frame, nominal_attributes = postprocess_X_y(*out) + X, y, frame, nominal_attributes = postprocess(*out) return Bunch(data=X, target=y, frame=frame, categories=nominal_attributes, From 14d34c489c699fde47feee40cb28adbac9326ae6 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sun, 26 Apr 2020 21:40:49 +0200 Subject: [PATCH 03/18] Fix merge conflicts --- sklearn/datasets/_openml.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/datasets/_openml.py b/sklearn/datasets/_openml.py index c01e6719e8578..657a12505dbf5 100644 --- a/sklearn/datasets/_openml.py +++ b/sklearn/datasets/_openml.py @@ -572,7 +572,7 @@ def _download_data_to_bunch( parse_arff = partial(_convert_arff_data_dataframe, columns=columns, features_dict=features_dict) - def postprocess(frame): # type:ignore + def postprocess(frame): X = frame[data_columns] if len(target_columns) >= 2: y = frame[target_columns] @@ -593,7 +593,7 @@ def parse_arff(arff): k in data_columns + target_columns} return X, y, nominal_attributes - def postprocess(X, y, 
nominal_attributes): # type:ignore + def postprocess(X, y, nominal_attributes): is_classification = {col_name in nominal_attributes for col_name in target_columns} if not is_classification: From a31781b3db5846f6e3c425b98d7bd20de86e38ec Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sun, 26 Apr 2020 21:50:43 +0200 Subject: [PATCH 04/18] More fixes --- sklearn/datasets/_openml.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sklearn/datasets/_openml.py b/sklearn/datasets/_openml.py index 657a12505dbf5..e3ba1b15dcc9e 100644 --- a/sklearn/datasets/_openml.py +++ b/sklearn/datasets/_openml.py @@ -35,8 +35,9 @@ _DATA_QUALITIES = "api/v1/json/data/qualities/{}" _DATA_FILE = "data/v1/download/{}" +OpenmlQuantileType = List[Dict[str, str]] +OpenmlFeaturesType = List[Dict[str, str]] -ArffQuantileType = List[Dict[str, str]] def _get_local_path(openml_path: str, data_home: str) -> str: @@ -445,7 +446,7 @@ def _get_data_description_by_id( def _get_data_features( data_id: int, data_home: Optional[str] -) -> List[Dict[str, str]]: +) -> OpenmlFeaturesType: # OpenML function: # https://www.openml.org/api_docs#!/data/get_data_features_id url = _DATA_FEATURES.format(data_id) @@ -460,7 +461,7 @@ def _get_data_features( def _get_data_qualities( data_id: int, data_home: Optional[str] -) -> Optional[ArffQuantileType]: +) -> Optional[OpenmlQuantileType]: # OpenML API function: # https://www.openml.org/api_docs#!/data/get_data_qualities_id url = _DATA_QUALITIES.format(data_id) @@ -478,7 +479,7 @@ def _get_data_qualities( return None -def _get_num_samples(data_qualities: Optional[ArffQuantileType]) -> int: +def _get_num_samples(data_qualities: Optional[OpenmlQuantileType]) -> int: """Get the number of samples from data qualities. Parameters From 59231e23e7a3aad46d8464bbf6adacec1548b941 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sun, 26 Apr 2020 22:08:11 +0200 Subject: [PATCH 05/18] Fixing CI --- azure-pipelines.yml | 2 ++ sklearn/externals/_arff.py | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 3e7b46039dfc4..6cef71b03671a 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -18,11 +18,13 @@ jobs: - bash: sudo chown -R $USER $CONDA displayName: Take ownership of conda installation - bash: | + conda init bash conda create --name flake8_env --yes python=3.8 conda activate flake8_env pip install flake8 mypy==0.770 displayName: Install flake8 - bash: | + set -ex if [[ $BUILD_SOURCEVERSIONMESSAGE =~ \[lint\ skip\] ]]; then # skip linting echo "Skipping linting" diff --git a/sklearn/externals/_arff.py b/sklearn/externals/_arff.py index 6d0ed3345ca8f..de9d6e3a0d552 100644 --- a/sklearn/externals/_arff.py +++ b/sklearn/externals/_arff.py @@ -185,7 +185,6 @@ class ArffContainerType(TypedDict): else: ArffContainerType = Dict[str, Any] - def _build_re_values(): quoted_re = r''' " # open quote followed by zero or more of: From 5fa0eddf5f4179331f031605f186c7e0c422b8e3 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sun, 26 Apr 2020 22:09:44 +0200 Subject: [PATCH 06/18] Lint --- sklearn/datasets/_openml.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/datasets/_openml.py b/sklearn/datasets/_openml.py index e3ba1b15dcc9e..bf10af738e9be 100644 --- a/sklearn/datasets/_openml.py +++ b/sklearn/datasets/_openml.py @@ -39,7 +39,6 @@ OpenmlFeaturesType = List[Dict[str, str]] - def _get_local_path(openml_path: str, data_home: str) -> str: return os.path.join(data_home, 'openml.org', openml_path + ".gz") From 
d202ecc97427702a74877a7141d79a1a743139eb Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sun, 26 Apr 2020 22:17:10 +0200 Subject: [PATCH 07/18] Conda activate doesn't work --- azure-pipelines.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 6cef71b03671a..c850154afb24b 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -18,9 +18,9 @@ jobs: - bash: sudo chown -R $USER $CONDA displayName: Take ownership of conda installation - bash: | - conda init bash + set -ex conda create --name flake8_env --yes python=3.8 - conda activate flake8_env + source activate flake8_env pip install flake8 mypy==0.770 displayName: Install flake8 - bash: | @@ -30,7 +30,7 @@ jobs: echo "Skipping linting" exit 0 else - conda activate flake8_env + source activate flake8_env ./build_tools/circle/linting.sh fi displayName: Run linting @@ -41,7 +41,7 @@ jobs: echo "Skipping linting" exit 0 else - conda activate flake8_env + source activate flake8_env mypy sklearn/ --ignore-missing-imports fi displayName: Run mypy From e50203dd83e86d55242cadb3500c5ff5314becf5 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sun, 26 Apr 2020 22:31:29 +0200 Subject: [PATCH 08/18] Simplify exception handling --- sklearn/datasets/_openml.py | 56 +++++++++++++++---------------------- 1 file changed, 22 insertions(+), 34 deletions(-) diff --git a/sklearn/datasets/_openml.py b/sklearn/datasets/_openml.py index bf10af738e9be..0c625c5f2eae0 100644 --- a/sklearn/datasets/_openml.py +++ b/sklearn/datasets/_openml.py @@ -127,12 +127,15 @@ def is_gzip(_fsrc): return gzip.GzipFile(local_path, 'rb') +class OpenMLError(Exception): + pass + + def _get_json_content_from_openml_api( url: str, error_message: Optional[str], - raise_if_error: bool, data_home: Optional[str] -) -> Optional[Dict]: +) -> Dict: """ Loads json data from the openml api @@ -146,21 +149,14 @@ def _get_json_content_from_openml_api( (acceptable error is, e.g., data id not found. Other errors, like 404's will throw the native error message) - raise_if_error : bool - Whether to raise an error if OpenML returns an acceptable error (e.g., - date not found). If this argument is set to False, a None is returned - in case of acceptable errors. Note that all other errors (e.g., 404) - will still be raised as normal. - data_home : str or None Location to cache the response. None if no cache is required. Returns ------- - json_data : json or None - the json result from the OpenML server if the call was successful; - None otherwise iff raise_if_error was set to False and the error was - ``acceptable`` + json_data : json + the json result from the OpenML server if the call was successful. + An exception otherwise. 
""" @_retry_with_clean_cache(url, data_home) @@ -177,9 +173,7 @@ def _load_json(): raise error # 412 error, not in except for nicer traceback - if raise_if_error: - raise ValueError(error_message) - return None + raise OpenMLError(error_message) def _split_sparse_columns( @@ -395,10 +389,8 @@ def _get_data_info_by_name( url = _SEARCH_NAME.format(name) + "/status/active/" error_msg = "No active dataset {} found.".format(name) json_data = _get_json_content_from_openml_api( - url, error_msg, raise_if_error=True, data_home=data_home + url, error_msg, data_home=data_home ) - # json_data should never be None with raise_if_error=True - assert json_data is not None res = json_data['data']['dataset'] if len(res) > 1: warn("Multiple active versions of the dataset matching the name" @@ -409,10 +401,14 @@ def _get_data_info_by_name( # an integer version has been provided url = (_SEARCH_NAME + "/data_version/{}").format(name, version) - json_data = _get_json_content_from_openml_api( - url, error_message=None, raise_if_error=False, data_home=data_home - ) - if json_data is None: + try: + json_data = _get_json_content_from_openml_api( + url, error_message=None, data_home=data_home + ) + retry = False + except OpenMLError: + retry = True + if retry: # we can do this in 1 function call if OpenML does not require the # specification of the dataset status (i.e., return datasets with a # given name / version regardless of active, deactivated, etc. ) @@ -421,10 +417,8 @@ def _get_data_info_by_name( error_msg = "Dataset {} with version {} not found.".format(name, version) json_data = _get_json_content_from_openml_api( - url, error_msg, raise_if_error=True, data_home=data_home + url, error_msg, data_home=data_home ) - # json_data should never be None with raise_if_error=True - assert json_data is not None return json_data['data']['dataset'][0] @@ -436,10 +430,8 @@ def _get_data_description_by_id( url = _DATA_INFO.format(data_id) error_message = "Dataset with data_id {} not found.".format(data_id) json_data = _get_json_content_from_openml_api( - url, error_message, raise_if_error=True, data_home=data_home + url, error_message, data_home=data_home ) - # json_data should never be None with raise_if_error=True - assert json_data is not None return json_data['data_set_description'] @@ -451,10 +443,8 @@ def _get_data_features( url = _DATA_FEATURES.format(data_id) error_message = "Dataset with data_id {} not found.".format(data_id) json_data = _get_json_content_from_openml_api( - url, error_message, raise_if_error=True, data_home=data_home + url, error_message, data_home=data_home ) - # json_data should never be None with raise_if_error=True - assert json_data is not None return json_data['data_features']['feature'] @@ -466,10 +456,8 @@ def _get_data_qualities( url = _DATA_QUALITIES.format(data_id) error_message = "Dataset with data_id {} not found.".format(data_id) json_data = _get_json_content_from_openml_api( - url, error_message, raise_if_error=True, data_home=data_home + url, error_message, data_home=data_home ) - # json_data should never be None with raise_if_error=True - assert json_data is not None try: return json_data['data_qualities']['quality'] except KeyError: From 00edef4205bcbc0e2717f5eac3b1ad120f5b666a Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sun, 26 Apr 2020 22:36:44 +0200 Subject: [PATCH 09/18] Simplify data_qualities --- sklearn/datasets/_openml.py | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/sklearn/datasets/_openml.py b/sklearn/datasets/_openml.py 
index 0c625c5f2eae0..305a8fb6e67c3 100644 --- a/sklearn/datasets/_openml.py +++ b/sklearn/datasets/_openml.py @@ -35,7 +35,7 @@ _DATA_QUALITIES = "api/v1/json/data/qualities/{}" _DATA_FILE = "data/v1/download/{}" -OpenmlQuantileType = List[Dict[str, str]] +OpenmlQualitiesType = List[Dict[str, str]] OpenmlFeaturesType = List[Dict[str, str]] @@ -450,7 +450,7 @@ def _get_data_features( def _get_data_qualities( data_id: int, data_home: Optional[str] -) -> Optional[OpenmlQuantileType]: +) -> OpenmlQualitiesType: # OpenML API function: # https://www.openml.org/api_docs#!/data/get_data_qualities_id url = _DATA_QUALITIES.format(data_id) @@ -458,15 +458,12 @@ def _get_data_qualities( json_data = _get_json_content_from_openml_api( url, error_message, data_home=data_home ) - try: - return json_data['data_qualities']['quality'] - except KeyError: - # the qualities might not be available, but we still try to process - # the data - return None + # the qualities might not be available, but we still try to process + # the data + return json_data['data_qualities'].get('quality', []) -def _get_num_samples(data_qualities: Optional[OpenmlQuantileType]) -> int: +def _get_num_samples(data_qualities: OpenmlQualitiesType) -> int: """Get the number of samples from data qualities. Parameters @@ -483,9 +480,6 @@ def _get_num_samples(data_qualities: Optional[OpenmlQuantileType]) -> int: # If the data qualities are unavailable, we return -1 default_n_samples = -1 - if data_qualities is None: - return default_n_samples - qualities = {d['name']: d['value'] for d in data_qualities} return int(float(qualities.get('NumberOfInstances', default_n_samples))) From 937f606d9907f4bd601a8f03cd33325679cc6a63 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sun, 26 Apr 2020 22:47:09 +0200 Subject: [PATCH 10/18] Fix tests --- sklearn/datasets/_openml.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/sklearn/datasets/_openml.py b/sklearn/datasets/_openml.py index 305a8fb6e67c3..2f333d3594a4f 100644 --- a/sklearn/datasets/_openml.py +++ b/sklearn/datasets/_openml.py @@ -88,7 +88,7 @@ def _open_openml_url(openml_path: str, data_home: Optional[str]): result : stream A stream to the OpenML resource """ - def is_gzip(_fsrc): + def is_gzip_encoded(_fsrc): return _fsrc.info().get('Content-Encoding', '') == 'gzip' req = Request(_OPENML_PREFIX + openml_path) @@ -96,7 +96,7 @@ def is_gzip(_fsrc): if data_home is None: fsrc = urlopen(req) - if is_gzip(fsrc): + if is_gzip_encoded(fsrc): return gzip.GzipFile(fileobj=fsrc, mode='rb') return fsrc @@ -111,10 +111,10 @@ def is_gzip(_fsrc): try: with closing(urlopen(req)) as fsrc: opener: Callable - if is_gzip(fsrc): - opener = gzip.GzipFile - else: + if is_gzip_encoded(fsrc): opener = open + else: + opener = gzip.GzipFile with opener(local_path, 'wb') as fdst: shutil.copyfileobj(fsrc, fdst) except Exception: @@ -127,7 +127,8 @@ def is_gzip(_fsrc): return gzip.GzipFile(local_path, 'rb') -class OpenMLError(Exception): +class OpenMLError(ValueError): + """HTTP 412 is a specific OpenML error code, indicating a generic error""" pass @@ -460,7 +461,7 @@ def _get_data_qualities( ) # the qualities might not be available, but we still try to process # the data - return json_data['data_qualities'].get('quality', []) + return json_data.get('data_qualities', {}).get('quality', []) def _get_num_samples(data_qualities: OpenmlQualitiesType) -> int: From 55ad34b49791eae65c4cad5b00895c9fa43a7037 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Wed, 3 Jun 2020 22:29:42 +0200 
Subject: [PATCH 11/18] Apply suggestions from code review Co-authored-by: Thomas J. Fan --- sklearn/datasets/_openml.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/datasets/_openml.py b/sklearn/datasets/_openml.py index 2f333d3594a4f..edfd64923b64d 100644 --- a/sklearn/datasets/_openml.py +++ b/sklearn/datasets/_openml.py @@ -230,7 +230,7 @@ def _sparse_data_to_array( def _convert_arff_data( - arff: Dict[str, Any], + arff: ArffContainerType, col_slice_x: List[int], col_slice_y: List[int], shape: Optional[Tuple] = None @@ -307,7 +307,7 @@ def _feature_to_dtype(feature: Dict[str, str]): def _convert_arff_data_dataframe( - arff, columns: List, features_dict: Dict[str, Any] + arff: ArffContainerType, columns: List, features_dict: Dict[str, Any] ) -> Tuple: """Convert the ARFF object into a pandas DataFrame. From 3856d5b5dac22cc12d5d29e261b9e999f3e21969 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 9 Jun 2020 10:52:22 +0200 Subject: [PATCH 12/18] Address review comments --- .pre-commit-config.yaml | 2 +- azure-pipelines.yml | 2 +- sklearn/datasets/_openml.py | 9 ++++----- sklearn/externals/_arff.py | 6 ++++-- 4 files changed, 10 insertions(+), 9 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index aa8df3c3cbc87..bf1bef99a5642 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -14,7 +14,7 @@ repos: # the code is not fully PEP8 compatible args: [--select=F401] - repo: https://github.com/pre-commit/mirrors-mypy - rev: v0.730 + rev: v0.780 hooks: - id: mypy args: diff --git a/azure-pipelines.yml b/azure-pipelines.yml index fb533177b11c6..afb67e837a5bd 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -17,7 +17,7 @@ jobs: inputs: versionSpec: '3.8' - bash: | - pip install flake8 mypy==0.770 + pip install flake8 mypy==0.780 displayName: Install linters - bash: | set -ex diff --git a/sklearn/datasets/_openml.py b/sklearn/datasets/_openml.py index 1f91925d70d73..9ac65d4f92d63 100644 --- a/sklearn/datasets/_openml.py +++ b/sklearn/datasets/_openml.py @@ -198,11 +198,11 @@ def _split_sparse_columns( Returns ------- - arff_data_new : tuple + arff_data_new : list Subset of arff data with only the include columns indicated by the include_columns argument. """ - arff_data_new: ArffDataType = (list(), list(), list()) + arff_data_new: ArffDataType = [list(), list(), list()] reindexed_columns = {column_idx: array_idx for array_idx, column_idx in enumerate(include_columns)} for val, row_idx, col_idx in zip(arff_data[0], arff_data[1], arff_data[2]): @@ -334,6 +334,7 @@ def _convert_arff_data_dataframe( arff_columns = list(attributes) # calculate chunksize + raise ValueError first_row = next(arff['data']) first_df = pd.DataFrame([first_row], columns=arff_columns) @@ -407,10 +408,7 @@ def _get_data_info_by_name( json_data = _get_json_content_from_openml_api( url, error_message=None, data_home=data_home ) - retry = False except OpenMLError: - retry = True - if retry: # we can do this in 1 function call if OpenML does not require the # specification of the dataset status (i.e., return datasets with a # given name / version regardless of active, deactivated, etc. 
) @@ -658,6 +656,7 @@ def _valid_data_column_names(features_list, target_columns): return valid_data_column_names +@_deprecate_positional_args def fetch_openml( name: Optional[str] = None, *, diff --git a/sklearn/externals/_arff.py b/sklearn/externals/_arff.py index de9d6e3a0d552..36ac9c0022680 100644 --- a/sklearn/externals/_arff.py +++ b/sklearn/externals/_arff.py @@ -149,7 +149,7 @@ __version__ = '2.4.0' import typing -from typing import Optional, Tuple, List, Dict, Any +from typing import Optional, List, Dict, Any import re import sys @@ -172,9 +172,10 @@ _RE_SPARSE_LINE = re.compile(r'^\s*\{.*\}\s*$', re.UNICODE) _RE_NONTRIVIAL_DATA = re.compile('["\'{}\\s]', re.UNICODE) -ArffDataType = Tuple[List, ...] +ArffDataType = List[List] if typing.TYPE_CHECKING: + # typing_extensions is only available is mypy is installed from typing_extensions import TypedDict class ArffContainerType(TypedDict): @@ -185,6 +186,7 @@ class ArffContainerType(TypedDict): else: ArffContainerType = Dict[str, Any] + def _build_re_values(): quoted_re = r''' " # open quote followed by zero or more of: From c514c5326114ee42e0130a1c9223116556ff679b Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 9 Jun 2020 12:55:06 +0200 Subject: [PATCH 13/18] More fixes --- sklearn/datasets/_openml.py | 14 ++++++++------ sklearn/externals/_arff.py | 9 ++++++--- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/sklearn/datasets/_openml.py b/sklearn/datasets/_openml.py index 9ac65d4f92d63..a4e8af9570996 100644 --- a/sklearn/datasets/_openml.py +++ b/sklearn/datasets/_openml.py @@ -18,7 +18,7 @@ import scipy.sparse from ..externals import _arff -from ..externals._arff import ArffDataType, ArffContainerType +from ..externals._arff import ArffSparseDataType, ArffContainerType from . import get_data_home from urllib.error import HTTPError from ..utils import Bunch @@ -179,8 +179,8 @@ def _load_json(): def _split_sparse_columns( - arff_data: ArffDataType, include_columns: List -) -> ArffDataType: + arff_data: ArffSparseDataType, include_columns: List +) -> ArffSparseDataType: """ obtains several columns from sparse arff representation. Additionally, the column indices are re-labelled, given the columns that are not included. @@ -202,7 +202,7 @@ def _split_sparse_columns( Subset of arff data with only the include columns indicated by the include_columns argument. """ - arff_data_new: ArffDataType = [list(), list(), list()] + arff_data_new: ArffSparseDataType = (list(), list(), list()) reindexed_columns = {column_idx: array_idx for array_idx, column_idx in enumerate(include_columns)} for val, row_idx, col_idx in zip(arff_data[0], arff_data[1], arff_data[2]): @@ -214,7 +214,7 @@ def _split_sparse_columns( def _sparse_data_to_array( - arff_data: ArffDataType, include_columns: List + arff_data: ArffSparseDataType, include_columns: List ) -> np.ndarray: # turns the sparse data back into an array (can't use toarray() function, # as this does only work on numeric data) @@ -333,8 +333,10 @@ def _convert_arff_data_dataframe( attributes = OrderedDict(arff['attributes']) arff_columns = list(attributes) + if isinstance(arff['data'], tuple): + raise ValueError("Unreachable code. 
arff['data'] must be a generator.") + # calculate chunksize - raise ValueError first_row = next(arff['data']) first_df = pd.DataFrame([first_row], columns=arff_columns) diff --git a/sklearn/externals/_arff.py b/sklearn/externals/_arff.py index 36ac9c0022680..1e4cc0fd69d21 100644 --- a/sklearn/externals/_arff.py +++ b/sklearn/externals/_arff.py @@ -149,7 +149,7 @@ __version__ = '2.4.0' import typing -from typing import Optional, List, Dict, Any +from typing import Optional, List, Dict, Any, Iterator, Union, Tuple import re import sys @@ -172,7 +172,9 @@ _RE_SPARSE_LINE = re.compile(r'^\s*\{.*\}\s*$', re.UNICODE) _RE_NONTRIVIAL_DATA = re.compile('["\'{}\\s]', re.UNICODE) -ArffDataType = List[List] +ArffDenseDataType = Iterator[List] +ArffSparseDataType = Tuple[List, ...] + if typing.TYPE_CHECKING: # typing_extensions is only available is mypy is installed @@ -182,7 +184,8 @@ class ArffContainerType(TypedDict): description: str relation: str attributes: List - data: ArffDataType + data: Union[ArffDenseDataType, ArffSparseDataType] + else: ArffContainerType = Dict[str, Any] From ebf1b59520c4f82028108cc15b8a8c57869265d5 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 9 Jun 2020 13:03:06 +0200 Subject: [PATCH 14/18] Another fix --- sklearn/datasets/_openml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/datasets/_openml.py b/sklearn/datasets/_openml.py index a4e8af9570996..a5ed71d075fa9 100644 --- a/sklearn/datasets/_openml.py +++ b/sklearn/datasets/_openml.py @@ -198,7 +198,7 @@ def _split_sparse_columns( Returns ------- - arff_data_new : list + arff_data_new : tuple Subset of arff data with only the include columns indicated by the include_columns argument. """ From 55ec82bcbd806a545ebb6f0b369be5e795c70a89 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 9 Jun 2020 13:05:34 +0200 Subject: [PATCH 15/18] Another typo --- sklearn/externals/_arff.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/externals/_arff.py b/sklearn/externals/_arff.py index 1e4cc0fd69d21..fa75808357321 100644 --- a/sklearn/externals/_arff.py +++ b/sklearn/externals/_arff.py @@ -177,7 +177,7 @@ if typing.TYPE_CHECKING: - # typing_extensions is only available is mypy is installed + # typing_extensions is available when mypy is installed from typing_extensions import TypedDict class ArffContainerType(TypedDict): From ebf80d1ead08df32c6eaa9cae2e6302bfbed0acd Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 9 Jun 2020 16:41:46 +0200 Subject: [PATCH 16/18] Check arff['data'] for being an iterable instead --- sklearn/datasets/_openml.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sklearn/datasets/_openml.py b/sklearn/datasets/_openml.py index a5ed71d075fa9..01a238902dfc0 100644 --- a/sklearn/datasets/_openml.py +++ b/sklearn/datasets/_openml.py @@ -333,8 +333,10 @@ def _convert_arff_data_dataframe( attributes = OrderedDict(arff['attributes']) arff_columns = list(attributes) - if isinstance(arff['data'], tuple): - raise ValueError("Unreachable code. arff['data'] must be a generator.") + if not isinstance(arff['data'], Generator): + raise ValueError( + "arff['data'] must be a generator when converting to pd.DataFrame." 
+ ) # calculate chunksize first_row = next(arff['data']) From 8c1fbfa2b1b952efc733bfffccdf7846b00fa02f Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 25 Jun 2020 17:40:22 +0200 Subject: [PATCH 17/18] Add requested tests --- sklearn/datasets/tests/test_openml.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py index 950c208444b7d..2f935072cb351 100644 --- a/sklearn/datasets/tests/test_openml.py +++ b/sklearn/datasets/tests/test_openml.py @@ -14,6 +14,8 @@ from sklearn.datasets._openml import (_open_openml_url, _arff, _DATA_FILE, + _convert_arff_data, + _convert_arff_data_dataframe, _get_data_description_by_id, _get_local_path, _retry_with_clean_cache, @@ -24,6 +26,7 @@ from sklearn.utils._testing import assert_allclose, assert_array_equal from urllib.error import HTTPError from sklearn.datasets.tests.test_common import check_return_X_y +from sklearn.externals._arff import ArffContainerType from functools import partial @@ -1204,3 +1207,25 @@ def test_fetch_openml_with_ignored_feature(monkeypatch, gzip_response): # so we assert that we don't have the ignored feature in the final Bunch assert dataset['data'].shape == (101, 16) assert 'animal' not in dataset['feature_names'] + + +def test_convert_arff_data_type(): + arff: ArffContainerType = { + 'data': (el for el in range(2)), + 'description': '', + 'relation': '', + 'attributes': [] + } + msg = r"shape must be provided when arr\['data'\] is a Generator" + with pytest.raises(ValueError, match=msg): + _convert_arff_data(arff, [0], [0], shape=None) + + arff = { + 'data': list(range(2)), + 'description': '', + 'relation': '', + 'attributes': [] + } + msg = r"arff\['data'\] must be a generator when converting to pd.DataFrame" + with pytest.raises(ValueError, match=msg): + _convert_arff_data_dataframe(arff, ['a'], {}) From c9a974a5a9de26471a2fa5d1c5a2d9d939bdedaa Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 25 Jun 2020 17:57:23 +0200 Subject: [PATCH 18/18] Only run test if pandas is installed --- sklearn/datasets/tests/test_openml.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py index 2f935072cb351..b16a34727c2aa 100644 --- a/sklearn/datasets/tests/test_openml.py +++ b/sklearn/datasets/tests/test_openml.py @@ -1210,6 +1210,8 @@ def test_fetch_openml_with_ignored_feature(monkeypatch, gzip_response): def test_convert_arff_data_type(): + pytest.importorskip('pandas') + arff: ArffContainerType = { 'data': (el for el in range(2)), 'description': '',
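
Taken together, the central pattern these commits introduce is a TypedDict that exists only for the type checker, with a plain Dict fallback at runtime so that typing_extensions stays a mypy-only dependency. Below is a minimal, self-contained sketch of that pattern, separate from the patches above: the ArffContainerType, ArffDenseDataType and ArffSparseDataType names and their fields mirror those added to sklearn/externals/_arff.py, while load_container and the demo at the bottom are purely illustrative and not part of scikit-learn.

import typing
from typing import Any, Dict, Iterator, List, Tuple, Union

ArffDenseDataType = Iterator[List]
ArffSparseDataType = Tuple[List, ...]

if typing.TYPE_CHECKING:
    # Only evaluated by mypy, so typing_extensions need not be installed
    # at runtime (it is pulled in with mypy itself).
    from typing_extensions import TypedDict

    class ArffContainerType(TypedDict):
        description: str
        relation: str
        attributes: List
        data: Union[ArffDenseDataType, ArffSparseDataType]
else:
    # At runtime the alias degrades gracefully to an untyped mapping.
    ArffContainerType = Dict[str, Any]


def load_container() -> "ArffContainerType":
    # Hypothetical helper, not part of scikit-learn: builds an empty
    # container in the shape the TypedDict above describes.
    return {
        'description': '',
        'relation': '',
        'attributes': [],
        'data': iter([]),
    }


if __name__ == "__main__":
    # Running this file needs neither mypy nor typing_extensions;
    # running `mypy` on it additionally checks the dict keys and types.
    print(sorted(load_container()))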