[MRG] Memory usage of OpenML fetcher: use generator from arff #13312 · scikit-learn/scikit-learn


Merged · 5 commits · Feb 28, 2019
65 changes: 56 additions & 9 deletions sklearn/datasets/openml.py
@@ -6,6 +6,8 @@
from warnings import warn
from contextlib import closing
from functools import wraps
+import itertools
+from collections.abc import Generator

from urllib.request import urlopen, Request

@@ -23,6 +25,7 @@
_SEARCH_NAME = "api/v1/json/data/list/data_name/{}/limit/2"
_DATA_INFO = "api/v1/json/data/{}"
_DATA_FEATURES = "api/v1/json/data/features/{}"
+_DATA_QUALITIES = "api/v1/json/data/qualities/{}"
_DATA_FILE = "data/v1/download/{}"


@@ -210,7 +213,7 @@ def _sparse_data_to_array(arff_data, include_columns):
    return y


-def _convert_arff_data(arff_data, col_slice_x, col_slice_y):
+def _convert_arff_data(arff_data, col_slice_x, col_slice_y, shape=None):
    """
    converts the arff object into the appropriate matrix type (np.array or
    scipy.sparse.csr_matrix) based on the 'data part' (i.e., in the
@@ -234,10 +237,16 @@ def _convert_arff_data(arff_data, col_slice_x, col_slice_y):
    X : np.array or scipy.sparse.csr_matrix
    y : np.array
    """
-    if isinstance(arff_data, list):
-        data = np.array(arff_data, dtype=np.float64)
-        X = np.array(data[:, col_slice_x], dtype=np.float64)
-        y = np.array(data[:, col_slice_y], dtype=np.float64)
+    if isinstance(arff_data, Generator):
+        if shape[0] == -1:
+            count = -1
+        else:
+            count = shape[0] * shape[1]
+        data = np.fromiter(itertools.chain.from_iterable(arff_data),
+                           dtype='float64', count=count)
+        data = data.reshape(*shape)
+        X = data[:, col_slice_x]
+        y = data[:, col_slice_y]
        return X, y
    elif isinstance(arff_data, tuple):
        arff_data_X = _split_sparse_columns(arff_data, col_slice_x)
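The new Generator branch streams values into a single preallocated buffer instead of building a list of lists and then copying it. A minimal standalone sketch of that pattern, where rows() is a hypothetical stand-in for the lazily decoded ARFF row stream:

import itertools
import numpy as np

def rows():
    # hypothetical stand-in for the ARFF row generator; one list per sample
    yield [1.0, 2.0, 3.0]
    yield [4.0, 5.0, 6.0]

shape = (2, 3)  # (n_samples, n_features), e.g. taken from the data qualities
count = -1 if shape[0] == -1 else shape[0] * shape[1]
# chain.from_iterable flattens the rows; fromiter fills one float64 buffer.
# A known count lets NumPy allocate that buffer once up front; count=-1 still
# works when n_samples is unknown, at the cost of growing the buffer.
data = np.fromiter(itertools.chain.from_iterable(rows()),
                   dtype='float64', count=count)
data = data.reshape(*shape)  # with shape[0] == -1, NumPy infers n_samples
print(data)
# [[1. 2. 3.]
#  [4. 5. 6.]]

Peak memory then stays close to the size of the final array, rather than the final array plus the intermediate list-of-lists representation.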
@@ -333,6 +342,34 @@ def _get_data_features(data_id, data_home):
    return json_data['data_features']['feature']


+def _get_data_qualities(data_id, data_home):
+    # OpenML API function:
+    # https://www.openml.org/api_docs#!/data/get_data_qualities_id
+    url = _DATA_QUALITIES.format(data_id)
+    error_message = "Dataset with data_id {} not found.".format(data_id)
+    json_data = _get_json_content_from_openml_api(url, error_message, True,
+                                                  data_home)
+    try:
+        return json_data['data_qualities']['quality']
+    except KeyError:
+        # the qualities might not be available, but we still try to process
+        # the data
+        return None


+def _get_data_shape(data_qualities):
+    # Using the data_qualities list from _get_data_qualities to extract
+    # the number of samples / features
+    if data_qualities is None:
+        return None
+    qualities = {d['name']: d['value'] for d in data_qualities}
+    try:
+        return (int(float(qualities['NumberOfInstances'])),
+                int(float(qualities['NumberOfFeatures'])))
+    except (KeyError, ValueError):
+        # the relevant qualities may be missing or non-numeric
+        return None
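The int(float(...)) round-trip suggests the qualities endpoint reports numeric values as decimal strings. For illustration, a hypothetical payload and how _get_data_shape reduces it to a shape tuple:

# hypothetical excerpt of json_data['data_qualities']['quality']
data_qualities = [
    {'name': 'NumberOfInstances', 'value': '150.0'},
    {'name': 'NumberOfFeatures', 'value': '5.0'},
]
qualities = {d['name']: d['value'] for d in data_qualities}
shape = (int(float(qualities['NumberOfInstances'])),
         int(float(qualities['NumberOfFeatures'])))
# shape == (150, 5)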


def _download_data_arff(file_id, sparse, data_home, encode_nominal=True):
    # Accesses an ARFF file on the OpenML server. Documentation:
    # https://www.openml.org/api_data_docs#!/data/get_download_id
@@ -346,7 +383,7 @@ def _arff_load():
            if sparse is True:
                return_type = _arff.COO
            else:
-                return_type = _arff.DENSE
+                return_type = _arff.DENSE_GEN

            arff_file = _arff.loads(response.read().decode('utf-8'),
                                    encode_nominal=encode_nominal,
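With DENSE_GEN, the liac-arff parser exposes the data as a generator that decodes rows on demand rather than as a fully materialized list. A small sketch, assuming a liac-arff version that provides the DENSE_GEN return type (the sample ARFF string is made up):

import arff  # liac-arff

sample = """@RELATION demo
@ATTRIBUTE width NUMERIC
@ATTRIBUTE height NUMERIC
@DATA
1.0,2.0
3.0,4.0
"""
decoded = arff.loads(sample, return_type=arff.DENSE_GEN)
# decoded['data'] is a generator of row lists, decoded lazily on iteration
print(next(decoded['data']))  # [1.0, 2.0]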
@@ -508,7 +545,7 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None,
        data_id = data_info['did']
    elif data_id is not None:
        # from the previous if statement, it is given that name is None
-        if version is not "active":
+        if version != "active":
            raise ValueError(
                "Dataset data_id={} and version={} passed, but you can only "
                "specify a numeric data_id or a version, not "
@@ -584,18 +621,28 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None,
    if data_description['format'].lower() == 'sparse_arff':
        return_sparse = True

+    if not return_sparse:
+        data_qualities = _get_data_qualities(data_id, data_home)
+        shape = _get_data_shape(data_qualities)
+        # if the data qualities were not available, we can still get the
+        # n_features from the feature list, with the n_samples unknown
+        if shape is None:
+            shape = (-1, len(features_list))
+    else:
+        shape = None

    # obtain the data
    arff = _download_data_arff(data_description['file_id'], return_sparse,
                               data_home)
-    arff_data = arff['data']

    # nominal attributes is a dict mapping from the attribute name to the
    # possible values. Includes also the target column (which will be popped
    # off below, before it will be packed in the Bunch object)
    nominal_attributes = {k: v for k, v in arff['attributes']
                          if isinstance(v, list) and
                          k in data_columns + target_column}

-    X, y = _convert_arff_data(arff_data, col_slice_x, col_slice_y)
+    X, y = _convert_arff_data(arff['data'], col_slice_x, col_slice_y, shape)

    is_classification = {col_name in nominal_attributes
                         for col_name in target_column}
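Taken together, a dense fetch now streams the ARFF payload straight into one preallocated array. A usage example (downloads from openml.org on the first call, then reads the local cache; a large dataset such as MNIST is where the memory saving matters):

from sklearn.datasets import fetch_openml

bunch = fetch_openml(name='mnist_784', version=1)
print(bunch.data.shape)  # (70000, 784)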
9 binary files not shown.
2 changes: 1 addition & 1 deletion sklearn/datasets/tests/test_openml.py
@@ -55,7 +55,7 @@ def decode_column(data_bunch, col_idx):
                              'code relatively simple')
    data_arff = _download_data_arff(data_description['file_id'],
                                    sparse, None, False)
-    data_downloaded = np.array(data_arff['data'], dtype='O')
+    data_downloaded = np.array(list(data_arff['data']), dtype='O')

    for i in range(len(data_bunch.feature_names)):
        # XXX: Test per column, as this makes it easier to avoid problems with
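The list(...) wrapper in the test is needed because np.array does not consume a generator; it wraps the generator object itself in a 0-dimensional object array. A quick illustration:

import numpy as np

rows = [[1, 'a'], [2, 'b']]
wrapped = np.array(iter(rows), dtype='O')
print(wrapped.shape)  # (): a 0-d array holding the iterator object itself
consumed = np.array(list(iter(rows)), dtype='O')
print(consumed.shape)  # (2, 2): rows materialized first, as in the test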