[MRG] Adds the ability to load datasets from OpenML containing string attributes by oanise93 · Pull Request #13177 · scikit-learn/scikit-learn · GitHub

[MRG] Adds the ability to load datasets from OpenML containing string attributes #13177


Closed · wants to merge 1 commit
32 changes: 23 additions & 9 deletions sklearn/datasets/openml.py
@@ -242,11 +242,10 @@ def _convert_arff_data(arff_data, col_slice_x, col_slice_y, shape=None):
            count = -1
        else:
            count = shape[0] * shape[1]
-        data = np.fromiter(itertools.chain.from_iterable(arff_data),
-                           dtype='float64', count=count)
+        data = np.array(list(itertools.chain.from_iterable(arff_data)))
Member:
I think that this change will reverse the enhancement from #13312.
@jorisvandenbossche Do you know what would be best here? It seems that we would make our own fromiter which selects the columns of X and y while iterating over the iterator. We should know the size in advance as well, shouldn't we?
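A minimal sketch of that idea (hypothetical helper, not code from this PR): iterate over the generator once, writing the X and y columns into preallocated arrays, so the known size is used up front and no intermediate list of Python objects is built. `col_slice_x` and `col_slice_y` are lists of column indices, as in `_convert_arff_data`:

```python
import numpy as np

def fromiter_xy(arff_data, col_slice_x, col_slice_y, n_rows):
    # Preallocate the outputs, then fill them row by row while consuming
    # the row generator exactly once.
    X = np.empty((n_rows, len(col_slice_x)), dtype=np.float64)
    y = np.empty((n_rows, len(col_slice_y)), dtype=np.float64)
    for i, row in enumerate(arff_data):
        X[i] = [row[j] for j in col_slice_x]
        y[i] = [row[j] for j in col_slice_y]
    return X, y
```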

Member:

Yes, this reverses the recent enhancement to use fromiter. Thinking about it, this change is somewhat messier with fromiter; I'm not sure what the right solution is. The purpose of fromiter was to avoid materialising all the Python objects, which was consuming a lot of temporary memory. But there are other ways to chunk the reading, I suppose.

The bigger problem here is that we're temporarily converting all numeric values to strings, which undoes the conversion work in the arff library.
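For reference, a toy illustration of the trade-off being discussed (not the PR's code): `np.fromiter` streams values into a preallocated buffer when `count` is known, whereas `np.array(list(...))` materialises every value as a Python object first, and with mixed str/float rows it also produces a string array that has to be converted back:

```python
import itertools
import numpy as np

rows = [[1.0, 2.0], [3.0, 4.0]]  # stand-in for arff_data

# The #13312 approach: values go straight into a float64 buffer.
a = np.fromiter(itertools.chain.from_iterable(rows),
                dtype='float64', count=4)

# This PR's approach: a temporary list of Python objects is built first.
b = np.array(list(itertools.chain.from_iterable(rows)))
```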

Member:
I suspect that we will need to treat string columns separately before this, with an operation like:

    string_col_idxs = ...  # a sorted list of string column indices
    if string_col_idxs:
        # slice(start + 1, stop) starts just past each string column, so
        # the slices cover only the numeric columns between them
        numeric_slices = [slice(start + 1, stop)
                          for start, stop in zip([-1] + string_col_idxs,
                                                 string_col_idxs + [None])]
        arff_data = (row[sl] for row in arff_data for sl in numeric_slices)

Though this may all be essentially replicating functionality in Pandas, so if we just supported pandas output, we could avoid this.
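A quick toy check of how those slices would carve up a row, assuming string columns at indices 1 and 3 (illustrative values only):

```python
row = [0.0, 'abc', 1.0, 'def', 2.0]
string_col_idxs = [1, 3]

numeric_slices = [slice(start + 1, stop)
                  for start, stop in zip([-1] + string_col_idxs,
                                         string_col_idxs + [None])]

print([row[sl] for sl in numeric_slices])  # [[0.0], [1.0], [2.0]]
```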

        data = data.reshape(*shape)
-        X = data[:, col_slice_x]
-        y = data[:, col_slice_y]
+        X = np.array(data[:, col_slice_x], dtype=np.float64)
+        y = np.array(data[:, col_slice_y], dtype=np.float64)
        return X, y
    elif isinstance(arff_data, tuple):
        arff_data_X = _split_sparse_columns(arff_data, col_slice_x)
@@ -287,7 +286,7 @@ def _get_data_info_by_name(name, version, data_home):
    Returns
    -------
    first_dataset : json
-        json representation of the first dataset object that adhired to the
+        json representation of the first dataset object that adhered to the
        search criteria

    """
@@ -436,7 +435,8 @@ def _valid_data_column_names(features_list, target_columns):


def fetch_openml(name=None, version='active', data_id=None, data_home=None,
-                 target_column='default-target', cache=True, return_X_y=False):
+                 ignore_strings=False, target_column='default-target',
+                 cache=True, return_X_y=False):
"""Fetch dataset from openml by name or dataset id.

Datasets are uniquely identified by either an integer ID or by a
@@ -450,7 +450,7 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None,
.. note:: EXPERIMENTAL

    The API is experimental in version 0.20 (particularly the return value
-    structure), and might have small backward-incompatble changes in
+    structure), and might have small backward-incompatible changes in
    future releases.

Parameters
@@ -475,6 +475,9 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None,
Specify another download and cache folder for the data sets. By default
all scikit-learn data is stored in '~/scikit_learn_data' subfolders.

+    ignore_strings : boolean, default=False
+        Whether to ignore string attributes when loading a dataset.

    target_column : string, list or None, default 'default-target'
        Specify the column name in the data to use as target. If
        'default-target', the standard target column as stored on the server
@@ -573,11 +576,22 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None,
# download data features, meta-info about column types
features_list = _get_data_features(data_id, data_home)

+    if ignore_strings:
+        string_features = [f for f in features_list
+                           if f['data_type'] == 'string']
+        if string_features:
+            string_feature_names = [f['name'] for f in string_features]
+            features_list = [f for f in features_list
+                             if f['name'] not in string_feature_names]

    for feature in features_list:
        if 'true' in (feature['is_ignore'], feature['is_row_identifier']):
            continue
-        if feature['data_type'] == 'string':
-            raise ValueError('STRING attributes are not yet supported')
+        if feature['data_type'] == 'string' and not ignore_strings:
+            raise ValueError('STRING attributes are not yet supported. '
+                             'If you would like to return the data '
+                             'without STRING attributes, use '
+                             'ignore_strings=True.')

if target_column == "default-target":
# determines the default target based on the data feature results
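For context, a minimal sketch of how the proposed flag would be used, assuming this branch were installed (Titanic, data_id 40945, contains string columns such as names and ticket codes; without the flag the call raises the ValueError above):

```python
from sklearn.datasets import fetch_openml

# With the proposed flag, string attributes are dropped instead of
# raising ValueError; only numeric/nominal columns are returned.
bunch = fetch_openml(data_id=40945, ignore_strings=True)
X, y = bunch.data, bunch.target
```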
8 binary files not shown.
72 changes: 58 additions & 14 deletions sklearn/datasets/tests/test_openml.py
@@ -66,7 +66,7 @@ def decode_column(data_bunch, col_idx):


def _fetch_dataset_from_openml(data_id, data_name, data_version,
-                               target_column,
+                               ignore_strings, target_column,
expected_observations, expected_features,
expected_missing,
expected_data_dtype, expected_target_dtype,
@@ -76,17 +76,18 @@ def _fetch_dataset_from_openml(data_id, data_name, data_version,
# result. Note that this function can be mocked (by invoking
# _monkey_patch_webbased_functions before invoking this function)
data_by_name_id = fetch_openml(name=data_name, version=data_version,
-                                   cache=False)
+                                   ignore_strings=ignore_strings, cache=False)
assert int(data_by_name_id.details['id']) == data_id

# Please note that cache=False is crucial, as the monkey patched files are
# not consistent with reality
-    fetch_openml(name=data_name, cache=False)
+    fetch_openml(name=data_name, ignore_strings=ignore_strings, cache=False)
# without specifying the version, there is no guarantee that the data id
# will be the same

# fetch with dataset id
data_by_id = fetch_openml(data_id=data_id, cache=False,
+                              ignore_strings=ignore_strings,
target_column=target_column)
assert data_by_id.details['name'] == data_name
assert data_by_id.data.shape == (expected_observations, expected_features)
@@ -112,7 +113,9 @@ def _fetch_dataset_from_openml(data_id, data_name, data_version,

if compare_default_target:
# check whether the data by id and data by id target are equal
-        data_by_id_default = fetch_openml(data_id=data_id, cache=False)
+        data_by_id_default = fetch_openml(data_id=data_id,
+                                          ignore_strings=ignore_strings,
+                                          cache=False)
if data_by_id.data.dtype == np.float64:
np.testing.assert_allclose(data_by_id.data,
data_by_id_default.data)
@@ -133,8 +136,9 @@ def _fetch_dataset_from_openml(data_id, data_name, data_version,
expected_missing)

# test return_X_y option
-    fetch_func = partial(fetch_openml, data_id=data_id, cache=False,
-                         target_column=target_column)
+    fetch_func = partial(fetch_openml, data_id=data_id,
+                         ignore_strings=ignore_strings,
+                         cache=False, target_column=target_column)
check_return_X_y(data_by_id, fetch_func)
return data_by_id

@@ -261,6 +265,7 @@ def test_fetch_openml_iris(monkeypatch, gzip_response):
data_id = 61
data_name = 'iris'
data_version = 1
+    ignore_strings = False
target_column = 'class'
expected_observations = 150
expected_features = 4
@@ -275,6 +280,7 @@ def test_fetch_openml_iris(monkeypatch, gzip_response):
_fetch_dataset_from_openml,
**{'data_id': data_id, 'data_name': data_name,
'data_version': data_version,
+           'ignore_strings': ignore_strings,
'target_column': target_column,
'expected_observations': expected_observations,
'expected_features': expected_features,
@@ -298,13 +304,15 @@ def test_fetch_openml_iris_multitarget(monkeypatch, gzip_response):
data_id = 61
data_name = 'iris'
data_version = 1
+    ignore_strings = False
target_column = ['sepallength', 'sepalwidth']
expected_observations = 150
expected_features = 3
expected_missing = 0

_monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)
-    _fetch_dataset_from_openml(data_id, data_name, data_version, target_column,
+    _fetch_dataset_from_openml(data_id, data_name, data_version,
+                               ignore_strings, target_column,
expected_observations, expected_features,
expected_missing,
object, np.float64, expect_sparse=False,
@@ -317,13 +325,15 @@ def test_fetch_openml_anneal(monkeypatch, gzip_response):
data_id = 2
data_name = 'anneal'
data_version = 1
+    ignore_strings = False
target_column = 'class'
# Not all original instances included for space reasons
expected_observations = 11
expected_features = 38
expected_missing = 267
_monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)
-    _fetch_dataset_from_openml(data_id, data_name, data_version, target_column,
+    _fetch_dataset_from_openml(data_id, data_name, data_version,
+                               ignore_strings, target_column,
expected_observations, expected_features,
expected_missing,
object, object, expect_sparse=False,
@@ -342,13 +352,15 @@ def test_fetch_openml_anneal_multitarget(monkeypatch, gzip_response):
data_id = 2
data_name = 'anneal'
data_version = 1
+    ignore_strings = False
target_column = ['class', 'product-type', 'shape']
# Not all original instances included for space reasons
expected_observations = 11
expected_features = 36
expected_missing = 267
_monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)
-    _fetch_dataset_from_openml(data_id, data_name, data_version, target_column,
+    _fetch_dataset_from_openml(data_id, data_name, data_version,
+                               ignore_strings, target_column,
expected_observations, expected_features,
expected_missing,
object, object, expect_sparse=False,
@@ -361,12 +373,14 @@ def test_fetch_openml_cpu(monkeypatch, gzip_response):
data_id = 561
data_name = 'cpu'
data_version = 1
+    ignore_strings = False
target_column = 'class'
expected_observations = 209
expected_features = 7
expected_missing = 0
_monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)
-    _fetch_dataset_from_openml(data_id, data_name, data_version, target_column,
+    _fetch_dataset_from_openml(data_id, data_name, data_version,
+                               ignore_strings, target_column,
expected_observations, expected_features,
expected_missing,
object, np.float64, expect_sparse=False,
@@ -388,6 +402,7 @@ def test_fetch_openml_australian(monkeypatch, gzip_response):
data_id = 292
data_name = 'Australian'
data_version = 1
+    ignore_strings = False
target_column = 'Y'
# Not all original instances included for space reasons
expected_observations = 85
@@ -400,6 +415,7 @@ def test_fetch_openml_australian(monkeypatch, gzip_response):
_fetch_dataset_from_openml,
**{'data_id': data_id, 'data_name': data_name,
'data_version': data_version,
+           'ignore_strings': ignore_strings,
'target_column': target_column,
'expected_observations': expected_observations,
'expected_features': expected_features,
@@ -417,13 +433,15 @@ def test_fetch_openml_adultcensus(monkeypatch, gzip_response):
data_id = 1119
data_name = 'adult-census'
data_version = 1
+    ignore_strings = False
target_column = 'class'
# Not all original instances included for space reasons
expected_observations = 10
expected_features = 14
expected_missing = 0
_monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)
-    _fetch_dataset_from_openml(data_id, data_name, data_version, target_column,
+    _fetch_dataset_from_openml(data_id, data_name, data_version,
+                               ignore_strings, target_column,
expected_observations, expected_features,
expected_missing,
np.float64, object, expect_sparse=False,
@@ -439,13 +457,15 @@ def test_fetch_openml_miceprotein(monkeypatch, gzip_response):
data_id = 40966
data_name = 'MiceProtein'
data_version = 4
+    ignore_strings = False
target_column = 'class'
# Not all original instances included for space reasons
expected_observations = 7
expected_features = 77
expected_missing = 7
_monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)
-    _fetch_dataset_from_openml(data_id, data_name, data_version, target_column,
+    _fetch_dataset_from_openml(data_id, data_name, data_version,
+                               ignore_strings, target_column,
expected_observations, expected_features,
expected_missing,
np.float64, object, expect_sparse=False,
@@ -458,14 +478,16 @@ def test_fetch_openml_emotions(monkeypatch, gzip_response):
data_id = 40589
data_name = 'emotions'
data_version = 3
+    ignore_strings = False
target_column = ['amazed.suprised', 'happy.pleased', 'relaxing.calm',
'quiet.still', 'sad.lonely', 'angry.aggresive']
expected_observations = 13
expected_features = 72
expected_missing = 0
_monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)

-    _fetch_dataset_from_openml(data_id, data_name, data_version, target_column,
+    _fetch_dataset_from_openml(data_id, data_name, data_version,
+                               ignore_strings, target_column,
expected_observations, expected_features,
expected_missing,
np.float64, object, expect_sparse=False,
Expand All @@ -478,6 +500,27 @@ def test_decode_emotions(monkeypatch):
_test_features_list(data_id)


+@pytest.mark.parametrize('gzip_response', [True, False])
+def test_fetch_titanic(monkeypatch, gzip_response):
+    # check because of the string attributes
+    data_id = 40945
+    data_name = 'Titanic'
+    data_version = 1
+    ignore_strings = True
+    target_column = 'survived'
+    # Not all original features included because five are strings
+    expected_observations = 1309
+    expected_features = 8
+    expected_missing = 1454
+    _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)
+    _fetch_dataset_from_openml(data_id, data_name, data_version,
+                               ignore_strings, target_column,
+                               expected_observations, expected_features,
+                               expected_missing,
+                               np.float64, object, expect_sparse=False,
+                               compare_default_target=True)


@pytest.mark.parametrize('gzip_response', [True, False])
def test_open_openml_url_cache(monkeypatch, gzip_response, tmpdir):
data_id = 61
@@ -667,7 +710,8 @@ def test_string_attribute(monkeypatch, gzip_response):
# single column test
assert_raise_message(ValueError,
'STRING attributes are not yet supported',
-                         fetch_openml, data_id=data_id, cache=False)
+                         fetch_openml, data_id=data_id, ignore_strings=False,
+                         cache=False)
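Note that `assert_raise_message` (from `sklearn.utils.testing` at the time) only checks that the expected text is a substring of the raised error message, which is why this test still passes against the longer message introduced above. A toy illustration:

```python
from sklearn.utils.testing import assert_raise_message

def boom():
    raise ValueError('STRING attributes are not yet supported. '
                     'If you would like to return the data without '
                     'STRING attributes, use ignore_strings=True.')

# Passes: the expected text is contained in the actual message.
assert_raise_message(ValueError,
                     'STRING attributes are not yet supported',
                     boom)
```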


@pytest.mark.parametrize('gzip_response', [True, False])