Revert "MNT Memory usage of OpenML fetcher: use generator from arff (… · xhluca/scikit-learn@e9b62e3 · GitHub

Commit e9b62e3

Xing committed
Revert "MNT Memory usage of OpenML fetcher: use generator from arff (scikit-learn#13312)"
This reverts commit 024c9ba.
1 parent 2daec5a commit e9b62e3

11 files changed (+10, -57 lines)

sklearn/datasets/openml.py

Lines changed: 9 additions & 56 deletions
@@ -6,8 +6,6 @@
 from warnings import warn
 from contextlib import closing
 from functools import wraps
-import itertools
-from collections.abc import Generator
 
 from urllib.request import urlopen, Request
 
@@ -25,7 +23,6 @@
 _SEARCH_NAME = "api/v1/json/data/list/data_name/{}/limit/2"
 _DATA_INFO = "api/v1/json/data/{}"
 _DATA_FEATURES = "api/v1/json/data/features/{}"
-_DATA_QUALITIES = "api/v1/json/data/qualities/{}"
 _DATA_FILE = "data/v1/download/{}"
 
 
@@ -213,7 +210,7 @@ def _sparse_data_to_array(arff_data, include_columns):
     return y
 
 
-def _convert_arff_data(arff_data, col_slice_x, col_slice_y, shape=None):
+def _convert_arff_data(arff_data, col_slice_x, col_slice_y):
     """
     converts the arff object into the appropriate matrix type (np.array or
     scipy.sparse.csr_matrix) based on the 'data part' (i.e., in the
@@ -237,16 +234,10 @@ def _convert_arff_data(arff_data, col_slice_x, col_slice_y, shape=None):
     X : np.array or scipy.sparse.csr_matrix
     y : np.array
     """
-    if isinstance(arff_data, Generator):
-        if shape[0] == -1:
-            count = -1
-        else:
-            count = shape[0] * shape[1]
-        data = np.fromiter(itertools.chain.from_iterable(arff_data),
-                           dtype='float64', count=count)
-        data = data.reshape(*shape)
-        X = data[:, col_slice_x]
-        y = data[:, col_slice_y]
+    if isinstance(arff_data, list):
+        data = np.array(arff_data, dtype=np.float64)
+        X = np.array(data[:, col_slice_x], dtype=np.float64)
+        y = np.array(data[:, col_slice_y], dtype=np.float64)
         return X, y
     elif isinstance(arff_data, tuple):
         arff_data_X = _split_sparse_columns(arff_data, col_slice_x)
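
For context on this hunk: the reverted code streamed the ARFF rows through a generator and filled a NumPy array directly with np.fromiter, while the restored code materializes the whole list of rows before slicing. Below is a minimal sketch of the two conversion paths, using made-up rows rather than a real ARFF payload (the variable names are illustrative, not taken from openml.py):

import itertools
import numpy as np

rows = [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]   # stand-in for arff['data']
shape = (2, 3)                              # (n_samples, n_features)

# Restored path: the full list of Python lists is converted in one call.
dense = np.array(rows, dtype=np.float64)

# Reverted path: rows are streamed into a flat buffer and reshaped, which
# avoids holding both the list of lists and the array in memory at once.
streamed = np.fromiter(itertools.chain.from_iterable(iter(rows)),
                       dtype='float64', count=shape[0] * shape[1])
streamed = streamed.reshape(*shape)

assert np.array_equal(dense, streamed)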
@@ -342,34 +333,6 @@ def _get_data_features(data_id, data_home):
     return json_data['data_features']['feature']
 
 
-def _get_data_qualities(data_id, data_home):
-    # OpenML API function:
-    # https://www.openml.org/api_docs#!/data/get_data_qualities_id
-    url = _DATA_QUALITIES.format(data_id)
-    error_message = "Dataset with data_id {} not found.".format(data_id)
-    json_data = _get_json_content_from_openml_api(url, error_message, True,
-                                                  data_home)
-    try:
-        return json_data['data_qualities']['quality']
-    except KeyError:
-        # the qualities might not be available, but we still try to process
-        # the data
-        return None
-
-
-def _get_data_shape(data_qualities):
-    # Using the data_info dictionary from _get_data_info_by_name to extract
-    # the number of samples / features
-    if data_qualities is None:
-        return None
-    qualities = {d['name']: d['value'] for d in data_qualities}
-    try:
-        return (int(float(qualities['NumberOfInstances'])),
-                int(float(qualities['NumberOfFeatures'])))
-    except AttributeError:
-        return None
-
-
 def _download_data_arff(file_id, sparse, data_home, encode_nominal=True):
     # Accesses an ARFF file on the OpenML server. Documentation:
     # https://www.openml.org/api_data_docs#!/data/get_download_id
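
The two helpers removed above converted the OpenML qualities payload into an expected (n_samples, n_features) shape for the generator path. A small illustration of that mapping, using a hypothetical qualities list shaped like the API response the helpers parsed:

# Hypothetical excerpt of an OpenML qualities payload: a list of name/value dicts.
data_qualities = [
    {'name': 'NumberOfInstances', 'value': '150.0'},
    {'name': 'NumberOfFeatures', 'value': '5.0'},
]

qualities = {d['name']: d['value'] for d in data_qualities}
shape = (int(float(qualities['NumberOfInstances'])),
         int(float(qualities['NumberOfFeatures'])))
print(shape)  # (150, 5)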
@@ -383,7 +346,7 @@ def _arff_load():
             if sparse is True:
                 return_type = _arff.COO
             else:
-                return_type = _arff.DENSE_GEN
+                return_type = _arff.DENSE
 
             arff_file = _arff.loads(response.read().decode('utf-8'),
                                     encode_nominal=encode_nominal,
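
This hunk switches the ARFF return type back from the streaming DENSE_GEN decoder to the fully materialized DENSE one, so arff['data'] is again a list of rows rather than a generator. A rough sketch of what that return type means, assuming the standalone liac-arff package imported as arff (scikit-learn itself calls a bundled copy imported as _arff):

import arff  # standalone liac-arff package

ARFF_TEXT = '''@RELATION demo
@ATTRIBUTE width NUMERIC
@ATTRIBUTE class {a,b}
@DATA
1.0,a
2.0,b
'''

# return_type=arff.DENSE yields 'data' as a fully materialized list of rows;
# the reverted DENSE_GEN variant yields a generator over the same rows instead.
decoded = arff.loads(ARFF_TEXT, encode_nominal=True, return_type=arff.DENSE)
print(decoded['data'])  # [[1.0, 0], [2.0, 1]]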
@@ -545,7 +508,7 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None,
         data_id = data_info['did']
     elif data_id is not None:
         # from the previous if statement, it is given that name is None
-        if version != "active":
+        if version is not "active":
            raise ValueError(
                "Dataset data_id={} and version={} passed, but you can only "
                "specify a numeric data_id or a version, not "
@@ -621,28 +584,18 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None,
     if data_description['format'].lower() == 'sparse_arff':
         return_sparse = True
 
-    if not return_sparse:
-        data_qualities = _get_data_qualities(data_id, data_home)
-        shape = _get_data_shape(data_qualities)
-        # if the data qualities were not available, we can still get the
-        # n_features from the feature list, with the n_samples unknown
-        if shape is None:
-            shape = (-1, len(features_list))
-    else:
-        shape = None
-
     # obtain the data
     arff = _download_data_arff(data_description['file_id'], return_sparse,
                                data_home)
-
+    arff_data = arff['data']
     # nominal attributes is a dict mapping from the attribute name to the
     # possible values. Includes also the target column (which will be popped
     # off below, before it will be packed in the Bunch object)
     nominal_attributes = {k: v for k, v in arff['attributes']
                           if isinstance(v, list) and
                           k in data_columns + target_column}
 
-    X, y = _convert_arff_data(arff['data'], col_slice_x, col_slice_y, shape)
+    X, y = _convert_arff_data(arff_data, col_slice_x, col_slice_y)
 
     is_classification = {col_name in nominal_attributes
                          for col_name in target_column}
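
Both versions of this code path ultimately feed the same public entry point. A minimal usage sketch of the fetcher whose internals the commit touches (network access and an OpenML cache directory are required; the dataset name and version are only an example):

from sklearn.datasets import fetch_openml

# Downloads the dataset on first use and caches it under data_home.
bunch = fetch_openml(name='iris', version=1)
print(bunch.data.shape, bunch.target.shape)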
9 binary files changed (not shown)

0 commit comments
