diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py
index be1dec6ac1c13..b85fad127cc89 100644
--- a/sklearn/datasets/openml.py
+++ b/sklearn/datasets/openml.py
@@ -6,6 +6,8 @@
 from warnings import warn
 from contextlib import closing
 from functools import wraps
+import itertools
+from collections.abc import Generator
 
 from urllib.request import urlopen, Request
 
@@ -23,6 +25,7 @@
 _SEARCH_NAME = "api/v1/json/data/list/data_name/{}/limit/2"
 _DATA_INFO = "api/v1/json/data/{}"
 _DATA_FEATURES = "api/v1/json/data/features/{}"
+_DATA_QUALITIES = "api/v1/json/data/qualities/{}"
 _DATA_FILE = "data/v1/download/{}"
 
 
@@ -210,7 +213,7 @@ def _sparse_data_to_array(arff_data, include_columns):
     return y
 
 
-def _convert_arff_data(arff_data, col_slice_x, col_slice_y):
+def _convert_arff_data(arff_data, col_slice_x, col_slice_y, shape=None):
     """
     converts the arff object into the appropriate matrix type (np.array or
     scipy.sparse.csr_matrix) based on the 'data part' (i.e., in the
@@ -234,10 +237,16 @@ def _convert_arff_data(arff_data, col_slice_x, col_slice_y):
     X : np.array or scipy.sparse.csr_matrix
     y : np.array
     """
-    if isinstance(arff_data, list):
-        data = np.array(arff_data, dtype=np.float64)
-        X = np.array(data[:, col_slice_x], dtype=np.float64)
-        y = np.array(data[:, col_slice_y], dtype=np.float64)
+    if isinstance(arff_data, Generator):
+        if shape[0] == -1:
+            count = -1
+        else:
+            count = shape[0] * shape[1]
+        data = np.fromiter(itertools.chain.from_iterable(arff_data),
+                           dtype='float64', count=count)
+        data = data.reshape(*shape)
+        X = data[:, col_slice_x]
+        y = data[:, col_slice_y]
         return X, y
     elif isinstance(arff_data, tuple):
         arff_data_X = _split_sparse_columns(arff_data, col_slice_x)
@@ -333,6 +342,34 @@ def _get_data_features(data_id, data_home):
     return json_data['data_features']['feature']
 
 
+def _get_data_qualities(data_id, data_home):
+    # OpenML API function:
+    # https://www.openml.org/api_docs#!/data/get_data_qualities_id
+    url = _DATA_QUALITIES.format(data_id)
+    error_message = "Dataset with data_id {} not found.".format(data_id)
+    json_data = _get_json_content_from_openml_api(url, error_message, True,
+                                                  data_home)
+    try:
+        return json_data['data_qualities']['quality']
+    except KeyError:
+        # the qualities might not be available, but we still try to process
+        # the data
+        return None
+
+
+def _get_data_shape(data_qualities):
+    # Using the data qualities from _get_data_qualities to extract
+    # the number of samples / features
+    if data_qualities is None:
+        return None
+    qualities = {d['name']: d['value'] for d in data_qualities}
+    try:
+        return (int(float(qualities['NumberOfInstances'])),
+                int(float(qualities['NumberOfFeatures'])))
+    except KeyError:
+        return None
+
+
 def _download_data_arff(file_id, sparse, data_home, encode_nominal=True):
     # Accesses an ARFF file on the OpenML server. Documentation:
     # https://www.openml.org/api_data_docs#!/data/get_download_id
@@ -346,7 +383,7 @@ def _arff_load():
             if sparse is True:
                 return_type = _arff.COO
             else:
-                return_type = _arff.DENSE
+                return_type = _arff.DENSE_GEN
 
             arff_file = _arff.loads(response.read().decode('utf-8'),
                                     encode_nominal=encode_nominal,
@@ -508,7 +545,7 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None,
         data_id = data_info['did']
     elif data_id is not None:
         # from the previous if statement, it is given that name is None
-        if version is not "active":
+        if version != "active":
             raise ValueError(
                 "Dataset data_id={} and version={} passed, but you can only "
                 "specify a numeric data_id or a version, not "
@@ -584,10 +621,20 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None,
     if data_description['format'].lower() == 'sparse_arff':
         return_sparse = True
 
+    if not return_sparse:
+        data_qualities = _get_data_qualities(data_id, data_home)
+        shape = _get_data_shape(data_qualities)
+        # if the data qualities were not available, we can still get the
+        # n_features from the feature list, with the n_samples unknown
+        if shape is None:
+            shape = (-1, len(features_list))
+    else:
+        shape = None
+
     # obtain the data
     arff = _download_data_arff(data_description['file_id'], return_sparse,
                                data_home)
-    arff_data = arff['data']
+
     # nominal attributes is a dict mapping from the attribute name to the
     # possible values. Includes also the target column (which will be popped
     # off below, before it will be packed in the Bunch object)
@@ -595,7 +642,7 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None,
                           if isinstance(v, list) and
                           k in data_columns + target_column}
 
-    X, y = _convert_arff_data(arff_data, col_slice_x, col_slice_y)
+    X, y = _convert_arff_data(arff['data'], col_slice_x, col_slice_y, shape)
 
     is_classification = {col_name in nominal_attributes for col_name in
                          target_column}
diff --git a/sklearn/datasets/tests/data/openml/1/api-v1-json-data-qualities-1.json.gz b/sklearn/datasets/tests/data/openml/1/api-v1-json-data-qualities-1.json.gz
new file mode 100644
index 0000000000000..d89342c9bd358
Binary files /dev/null and b/sklearn/datasets/tests/data/openml/1/api-v1-json-data-qualities-1.json.gz differ
diff --git a/sklearn/datasets/tests/data/openml/1119/api-v1-json-data-qualities-1119.json.gz b/sklearn/datasets/tests/data/openml/1119/api-v1-json-data-qualities-1119.json.gz
new file mode 100644
index 0000000000000..10ae7e4791847
Binary files /dev/null and b/sklearn/datasets/tests/data/openml/1119/api-v1-json-data-qualities-1119.json.gz differ
diff --git a/sklearn/datasets/tests/data/openml/2/api-v1-json-data-qualities-2.json.gz b/sklearn/datasets/tests/data/openml/2/api-v1-json-data-qualities-2.json.gz
new file mode 100644
index 0000000000000..c1abe1f9c6b81
Binary files /dev/null and b/sklearn/datasets/tests/data/openml/2/api-v1-json-data-qualities-2.json.gz differ
diff --git a/sklearn/datasets/tests/data/openml/3/api-v1-json-data-qualities-3.json.gz b/sklearn/datasets/tests/data/openml/3/api-v1-json-data-qualities-3.json.gz
new file mode 100644
index 0000000000000..46775cf9fec52
Binary files /dev/null and b/sklearn/datasets/tests/data/openml/3/api-v1-json-data-qualities-3.json.gz differ
diff --git a/sklearn/datasets/tests/data/openml/40589/api-v1-json-data-qualities-40589.json.gz b/sklearn/datasets/tests/data/openml/40589/api-v1-json-data-qualities-40589.json.gz
new file mode 100644
index 0000000000000..b019974c965c4
Binary files /dev/null and b/sklearn/datasets/tests/data/openml/40589/api-v1-json-data-qualities-40589.json.gz differ
diff --git a/sklearn/datasets/tests/data/openml/40675/api-v1-json-data-qualities-40675.json.gz b/sklearn/datasets/tests/data/openml/40675/api-v1-json-data-qualities-40675.json.gz
new file mode 100644
index 0000000000000..74953820e70b7
Binary files /dev/null and b/sklearn/datasets/tests/data/openml/40675/api-v1-json-data-qualities-40675.json.gz differ
diff --git a/sklearn/datasets/tests/data/openml/40966/api-v1-json-data-qualities-40966.json.gz b/sklearn/datasets/tests/data/openml/40966/api-v1-json-data-qualities-40966.json.gz
new file mode 100644
index 0000000000000..f030c25582749
Binary files /dev/null and b/sklearn/datasets/tests/data/openml/40966/api-v1-json-data-qualities-40966.json.gz differ
diff --git a/sklearn/datasets/tests/data/openml/561/api-v1-json-data-qualities-561.json.gz b/sklearn/datasets/tests/data/openml/561/api-v1-json-data-qualities-561.json.gz
new file mode 100644
index 0000000000000..bacdb6e6c1c42
Binary files /dev/null and b/sklearn/datasets/tests/data/openml/561/api-v1-json-data-qualities-561.json.gz differ
diff --git a/sklearn/datasets/tests/data/openml/61/api-v1-json-data-qualities-61.json.gz b/sklearn/datasets/tests/data/openml/61/api-v1-json-data-qualities-61.json.gz
new file mode 100644
index 0000000000000..463b09d386eb4
Binary files /dev/null and b/sklearn/datasets/tests/data/openml/61/api-v1-json-data-qualities-61.json.gz differ
diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py
index 84f87297c13b1..9c8200731aa6d 100644
--- a/sklearn/datasets/tests/test_openml.py
+++ b/sklearn/datasets/tests/test_openml.py
@@ -55,7 +55,7 @@ def decode_column(data_bunch, col_idx):
                          'code relatively simple')
     data_arff = _download_data_arff(data_description['file_id'],
                                     sparse, None, False)
-    data_downloaded = np.array(data_arff['data'], dtype='O')
+    data_downloaded = np.array(list(data_arff['data']), dtype='O')
 
     for i in range(len(data_bunch.feature_names)):
         # XXX: Test per column, as this makes it easier to avoid problems with
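
The core of the patch is the switch from _arff.DENSE to _arff.DENSE_GEN: the ARFF
decoder then yields rows lazily instead of materialising the full list of rows,
which is what lets large OpenML datasets stream through conversion. A minimal
sketch of the difference, assuming a liac-arff release that provides DENSE_GEN
(the module used as _arff above); the two-row ARFF dataset is made up for
illustration:

    import arff  # liac-arff

    arff_text = "\n".join([
        "@RELATION demo",
        "@ATTRIBUTE width NUMERIC",
        "@ATTRIBUTE height NUMERIC",
        "@DATA",
        "1.0,2.0",
        "3.0,4.0",
    ])

    dense = arff.loads(arff_text, return_type=arff.DENSE)
    print(dense['data'])       # [[1.0, 2.0], [3.0, 4.0]], fully materialised

    lazy = arff.loads(arff_text, return_type=arff.DENSE_GEN)
    print(next(lazy['data']))  # [1.0, 2.0]; rows are decoded one at a time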
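
A generator has no length, so _convert_arff_data now needs the
(n_samples, n_features) shape up front: np.fromiter consumes the flattened row
stream in a single pass without building an intermediate list. When the
qualities are unavailable the shape falls back to (-1, n_features); count=-1
makes np.fromiter read until exhaustion and reshape(-1, n_features) infers the
row count. A self-contained sketch of that conversion, where the rows generator
stands in for arff['data']:

    import itertools
    import numpy as np

    def rows():
        # stand-in for the generator returned as arff['data']
        yield [1.0, 2.0, 3.0]
        yield [4.0, 5.0, 6.0]

    shape = (2, 3)  # (NumberOfInstances, NumberOfFeatures) from the qualities
    count = -1 if shape[0] == -1 else shape[0] * shape[1]
    data = np.fromiter(itertools.chain.from_iterable(rows()),
                       dtype='float64', count=count)
    data = data.reshape(*shape)
    print(data.shape)  # (2, 3)

Note that the dense path relies on fetch_openml always supplying a shape: with
shape=None the shape[0] test would raise a TypeError, which is why shape is
only left as None on the sparse path, where arff['data'] is a tuple rather
than a generator.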
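
_get_data_shape pulls that shape out of the qualities payload. OpenML reports
the values as strings such as '150.0', hence the int(float(...)) round-trip; a
missing quality surfaces as a KeyError from the dict lookup, in which case the
caller falls back to (-1, len(features_list)). A sketch with a hypothetical
payload (the values shown are made up):

    data_qualities = [
        {'name': 'NumberOfInstances', 'value': '150.0'},
        {'name': 'NumberOfFeatures', 'value': '5.0'},
    ]
    qualities = {d['name']: d['value'] for d in data_qualities}
    shape = (int(float(qualities['NumberOfInstances'])),
             int(float(qualities['NumberOfFeatures'])))
    print(shape)  # (150, 5)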
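
The test change follows from the same switch: np.array does not iterate an
arbitrary iterator but wraps it in a 0-d object array, so the generator behind
data_arff['data'] has to be materialised with list() before building the
array. For instance:

    import numpy as np

    gen = (row for row in ([1, 'a'], [2, 'b']))
    print(np.array(gen, dtype='O').shape)        # (), 0-d array holding the generator
    print(np.array(list(gen), dtype='O').shape)  # (2, 2)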