8000 MNT Memory usage of OpenML fetcher: use generator from arff (#13312) · xhluca/scikit-learn@024c9ba · GitHub
[go: up one dir, main page]

Skip to content

Commit 024c9ba

Browse files
jorisvandenbossche authored and Xing committed
MNT Memory usage of OpenML fetcher: use generator from arff (scikit-learn#13312)
* Memory usage of OpenML fetcher: use generator from arff
* fix actually getting data qualities in all cases
* Add qualities responses
* add workaround for cases where data qualities are not available
* feedback joel
1 parent 75614ef commit 024c9ba

11 files changed

+57
-10
lines changed

sklearn/datasets/openml.py

Lines changed: 56 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
from warnings import warn
77
from contextlib import closing
88
from functools import wraps
9+
import itertools
10+
from collections.abc import Generator
911

1012
from urllib.request import urlopen, Request
1113

@@ -23,6 +25,7 @@
2325
_SEARCH_NAME = "api/v1/json/data/list/data_name/{}/limit/2"
2426
_DATA_INFO = "api/v1/json/data/{}"
2527
_DATA_FEATURES = "api/v1/json/data/features/{}"
28+
_DATA_QUALITIES = "api/v1/json/data/qualities/{}"
2629
_DATA_FILE = "data/v1/download/{}"
2730

2831

@@ -210,7 +213,7 @@ def _sparse_data_to_array(arff_data, include_columns):
210213
return y
211214

212215

213-
def _convert_arff_data(arff_data, col_slice_x, col_slice_y):
216+
def _convert_arff_data(arff_data, col_slice_x, col_slice_y, shape=None):
214217
"""
215218
converts the arff object into the appropriate matrix type (np.array or
216219
scipy.sparse.csr_matrix) based on the 'data part' (i.e., in the
@@ -234,10 +237,16 @@ def _convert_arff_data(arff_data, col_slice_x, col_slice_y):
234237
X : np.array or scipy.sparse.csr_matrix
235238
y : np.array
236239
"""
237-
if isinstance(arff_data, list):
238-
data = np.array(arff_data, dtype=np.float64)
239-
X = np.array(data[:, col_slice_x], dtype=np.float64)
240-
y = np.array(data[:, col_slice_y], dtype=np.float64)
240+
if isinstance(arff_data, Generator):
241+
if shape[0] == -1:
242+
count = -1
243+
else:
244+
count = shape[0] * shape[1]
245+
data = np.fromiter(itertools.chain.from_iterable(arff_data),
246+
dtype='float64', count=count)
247+
data = data.reshape(*shape)
248+
X = data[:, col_slice_x]
249+
y = data[:, col_slice_y]
241250
return X, y
242251
elif isinstance(arff_data, tuple):
243252
arff_data_X = _split_sparse_columns(arff_data, col_slice_x)
@@ -333,6 +342,34 @@ def _get_data_features(data_id, data_home):
333342
return json_data['data_features']['feature']
334343

335344

345+
def _get_data_qualities(data_id, data_home):
    """Return the list of data qualities for ``data_id``, or None.

    Queries the OpenML qualities endpoint; see
    https://www.openml.org/api_docs#!/data/get_data_qualities_id
    """
    error_message = "Dataset with data_id {} not found.".format(data_id)
    json_data = _get_json_content_from_openml_api(
        _DATA_QUALITIES.format(data_id), error_message, True, data_home)
    try:
        qualities = json_data['data_qualities']['quality']
    except KeyError:
        # Qualities may be unavailable for some datasets; the caller can
        # still try to process the data without them.
        qualities = None
    return qualities
358+
359+
360+
def _get_data_shape(data_qualities):
361+
# Using the data_info dictionary from _get_data_info_by_name to extract
362+
# the number of samples / features
363+
if data_qualities is None:
364+
return None
365+
qualities = {d['name']: d['value'] for d in data_qualities}
366+
try:
367+
return (int(float(qualities['NumberOfInstances'])),
368+
int(float(qualities['NumberOfFeatures'])))
369+
except AttributeError:
370+
return None
371+
372+
336373
def _download_data_arff(file_id, sparse, data_home, encode_nominal=True):
337374
# Accesses an ARFF file on the OpenML server. Documentation:
338375
# https://www.openml.org/api_data_docs#!/data/get_download_id
@@ -346,7 +383,7 @@ def _arff_load():
346383
if sparse is True:
347384
return_type = _arff.COO
348385
else:
349-
return_type = _arff.DENSE
386+
return_type = _arff.DENSE_GEN
350387

351388
arff_file = _arff.loads(response.read().decode('utf-8'),
352389
encode_nominal=encode_nominal,
@@ -508,7 +545,7 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None,
508545
data_id = data_info['did']
509546
elif data_id is not None:
510547
# from the previous if statement, it is given that name is None
511-
if version is not "active":
548+
if version != "active":
512549
raise ValueError(
513550
"Dataset data_id={} and version={} passed, but you can only "
514551
"specify a numeric data_id or a version, not "
@@ -584,18 +621,28 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None,
584621
if data_description['format'].lower() == 'sparse_arff':
585622
return_sparse = True
586623

624+
if not return_sparse:
625+
data_qualities = _get_data_qualities(data_id, data_home)
626+
shape = _get_data_shape(data_qualities)
627+
# if the data qualities were not available, we can still get the
628+
# n_features from the feature list, with the n_samples unknown
629+
if shape is None:
630+
shape = (-1, len(features_list))
631+
else:
632+
shape = None
633+
587634
# obtain the data
588635
arff = _download_data_arff(data_description['file_id'], return_sparse,
589636
data_home)
590-
arff_data = arff['data']
637+
591638
# nominal attributes is a dict mapping from the attribute name to the
592639
# possible values. Includes also the target column (which will be popped
593640
# off below, before it will be packed in the Bunch object)
594641
nominal_attributes = {k: v for k, v in arff['attributes']
595642
if isinstance(v, list) and
596643
k in data_columns + target_column}
597644

598-
X, y = _convert_arff_data(arff_data, col_slice_x, col_slice_y)
645+
X, y = _convert_arff_data(arff['data'], col_slice_x, col_slice_y, shape)
599646

600647
is_classification = {col_name in nominal_attributes
601648
for col_name in target_column}
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.

0 commit comments

Comments (0)