ENH: adds the ability to load datasets from OpenML containing string attributes · scikit-learn/scikit-learn@b437471

Commit b437471

ENH: adds the ability to load datasets from OpenML containing string
attributes by providing the option to ignore said attributes. Currently, an error is raised when a dataset containing string attributes (e.g., the Titanic dataset) is fetched from OpenML. This commit lets users opt into loading only the non-string subset of such a dataset. Closes #11819.
1 parent 4140657 commit b437471

8 files changed: 81 additions & 23 deletions
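For orientation, here is a minimal usage sketch of the option this commit adds. The call itself is illustrative (it assumes network access or the mocked responses used in the tests); data_id 40945 is the Titanic dataset exercised by the new test_fetch_titanic test.

from sklearn.datasets import fetch_openml

# Titanic (data_id=40945) contains STRING attributes such as passenger names.
# Before this commit the call below raised ValueError; with ignore_strings=True
# the string columns are simply dropped from the returned data.
titanic = fetch_openml(data_id=40945, target_column='survived',
                       ignore_strings=True)

print(titanic.data.shape)    # (1309, 8) per the expectations in the new test
print(titanic.target.shape)  # (1309,)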

sklearn/datasets/openml.py

Lines changed: 23 additions & 9 deletions
@@ -242,11 +242,10 @@ def _convert_arff_data(arff_data, col_slice_x, col_slice_y, shape=None):
             count = -1
         else:
             count = shape[0] * shape[1]
-        data = np.fromiter(itertools.chain.from_iterable(arff_data),
-                           dtype='float64', count=count)
+        data = np.array(list(itertools.chain.from_iterable(arff_data)))
         data = data.reshape(*shape)
-        X = data[:, col_slice_x]
-        y = data[:, col_slice_y]
+        X = np.array(data[:, col_slice_x], dtype=np.float64)
+        y = np.array(data[:, col_slice_y], dtype=np.float64)
         return X, y
     elif isinstance(arff_data, tuple):
         arff_data_X = _split_sparse_columns(arff_data, col_slice_x)
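The hunk above stops forcing float64 on the whole row stream (np.fromiter) and instead builds an intermediate array, casting only the returned column slices. The point is that rows may still carry values that cannot be parsed as float64 for columns that are later sliced away. A standalone sketch of the idea with invented row data, not the real ARFF payload:

import itertools

import numpy as np

# Toy rows as an ARFF parser might yield them; the last column is a string
# attribute, so np.fromiter(..., dtype='float64') over the flattened rows
# would fail.
arff_data = [[5.1, 3.5, 1.4, 0.2, 'foo'],
             [4.9, 3.0, 1.4, 0.2, 'bar']]
shape = (2, 5)
col_slice_x, col_slice_y = [0, 1, 2], [3]   # the string column 4 is excluded

# Build an intermediate array without forcing a dtype ...
data = np.array(list(itertools.chain.from_iterable(arff_data)))
data = data.reshape(*shape)

# ... then cast only the slices that are actually returned.
X = np.array(data[:, col_slice_x], dtype=np.float64)
y = np.array(data[:, col_slice_y], dtype=np.float64)
print(X.dtype, X.shape, y.shape)  # float64 (2, 3) (2, 1)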
@@ -287,7 +286,7 @@ def _get_data_info_by_name(name, version, data_home):
     Returns
     -------
     first_dataset : json
-        json representation of the first dataset object that adhired to the
+        json representation of the first dataset object that adhered to the
         search criteria

    """
@@ -436,7 +435,8 @@ def _valid_data_column_names(features_list, target_columns):


 def fetch_openml(name=None, version='active', data_id=None, data_home=None,
-                 target_column='default-target', cache=True, return_X_y=False):
+                 ignore_strings=False, target_column='default-target',
+                 cache=True, return_X_y=False):
     """Fetch dataset from openml by name or dataset id.

     Datasets are uniquely identified by either an integer ID or by a
@@ -450,7 +450,7 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None,
     .. note:: EXPERIMENTAL

         The API is experimental in version 0.20 (particularly the return value
-        structure), and might have small backward-incompatble changes in
+        structure), and might have small backward-incompatible changes in
         future releases.

     Parameters
@@ -475,6 +475,9 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None,
         Specify another download and cache folder for the data sets. By default
         all scikit-learn data is stored in '~/scikit_learn_data' subfolders.

+    ignore_strings : boolean, default=False
+        Whether to ignore string attributes when loading a dataset.
+
     target_column : string, list or None, default 'default-target'
         Specify the column name in the data to use as target. If
         'default-target', the standard target column a stored on the server
@@ -573,11 +576,22 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None,
     # download data features, meta-info about column types
     features_list = _get_data_features(data_id, data_home)

+    if ignore_strings:
+        string_features = [f for f in features_list
+                           if f['data_type'] == 'string']
+        if string_features:
+            string_feature_names = [f['name'] for f in string_features]
+            features_list = [f for f in features_list if f['name'] not in
+                             string_feature_names]
+
     for feature in features_list:
         if 'true' in (feature['is_ignore'], feature['is_row_identifier']):
             continue
-        if feature['data_type'] == 'string':
-            raise ValueError('STRING attributes are not yet supported')
+        if feature['data_type'] == 'string' and not ignore_strings:
+            raise ValueError('STRING attributes are not yet supported. '
+                             'If you would like to return the data '
+                             'without STRING attributes, use '
+                             'ignore_strings=True.')

     if target_column == "default-target":
         # determines the default target based on the data feature results
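To make the filtering above concrete, here is the same pair of list comprehensions run over a hand-written features_list. The field names mirror the OpenML feature metadata used in this function; the values are invented.

# Invented metadata in the shape returned by _get_data_features.
features_list = [
    {'name': 'age',  'data_type': 'numeric', 'is_ignore': 'false',
     'is_row_identifier': 'false'},
    {'name': 'name', 'data_type': 'string',  'is_ignore': 'false',
     'is_row_identifier': 'false'},
    {'name': 'fare', 'data_type': 'numeric', 'is_ignore': 'false',
     'is_row_identifier': 'false'},
]

ignore_strings = True
if ignore_strings:
    string_features = [f for f in features_list
                       if f['data_type'] == 'string']
    if string_features:
        string_feature_names = [f['name'] for f in string_features]
        features_list = [f for f in features_list
                         if f['name'] not in string_feature_names]

print([f['name'] for f in features_list])  # ['age', 'fare']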
4 binary files not shown.

sklearn/datasets/tests/test_openml.py

Lines changed: 58 additions & 14 deletions
@@ -66,7 +66,7 @@ def decode_column(data_bunch, col_idx):


 def _fetch_dataset_from_openml(data_id, data_name, data_version,
-                               target_column,
+                               ignore_strings, target_column,
                                expected_observations, expected_features,
                                expected_missing,
                                expected_data_dtype, expected_target_dtype,
@@ -76,17 +76,18 @@ def _fetch_dataset_from_openml(data_id, data_name, data_version,
     # result. Note that this function can be mocked (by invoking
     # _monkey_patch_webbased_functions before invoking this function)
     data_by_name_id = fetch_openml(name=data_name, version=data_version,
-                                   cache=False)
+                                   ignore_strings=ignore_strings, cache=False)
     assert int(data_by_name_id.details['id']) == data_id

     # Please note that cache=False is crucial, as the monkey patched files are
     # not consistent with reality
-    fetch_openml(name=data_name, cache=False)
+    fetch_openml(name=data_name, ignore_strings=ignore_strings, cache=False)
     # without specifying the version, there is no guarantee that the data id
     # will be the same

     # fetch with dataset id
     data_by_id = fetch_openml(data_id=data_id, cache=False,
+                              ignore_strings=ignore_strings,
                               target_column=target_column)
     assert data_by_id.details['name'] == data_name
     assert data_by_id.data.shape == (expected_observations, expected_features)
@@ -112,7 +113,9 @@ def _fetch_dataset_from_openml(data_id, data_name, data_version,

     if compare_default_target:
         # check whether the data by id and data by id target are equal
-        data_by_id_default = fetch_openml(data_id=data_id, cache=False)
+        data_by_id_default = fetch_openml(data_id=data_id,
+                                          ignore_strings=ignore_strings,
+                                          cache=False)
         if data_by_id.data.dtype == np.float64:
             np.testing.assert_allclose(data_by_id.data,
                                        data_by_id_default.data)
@@ -133,8 +136,9 @@ def _fetch_dataset_from_openml(data_id, data_name, data_version,
                 expected_missing)

     # test return_X_y option
-    fetch_func = partial(fetch_openml, data_id=data_id, cache=False,
-                         target_column=target_column)
+    fetch_func = partial(fetch_openml, data_id=data_id,
+                         ignore_strings=ignore_strings,
+                         cache=False, target_column=target_column)
     check_return_X_y(data_by_id, fetch_func)
     return data_by_id

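The return_X_y check above works by handing the shared helper a functools.partial of the exact fetch call, so it can be re-issued with return_X_y=True. A simplified stand-in for that helper, shown only to illustrate the pattern (check_return_X_y's real implementation lives in the shared test utilities and may differ):

from functools import partial

def check_return_X_y(bunch, fetch_func_partial):
    # Re-issue the same fetch with return_X_y=True and compare shapes
    # against the Bunch result.
    X, y = fetch_func_partial(return_X_y=True)
    assert X.shape == bunch.data.shape
    assert y.shape == bunch.target.shape

# Hypothetical usage mirroring the helper above:
# fetch_func = partial(fetch_openml, data_id=data_id,
#                      ignore_strings=ignore_strings,
#                      cache=False, target_column=target_column)
# check_return_X_y(data_by_id, fetch_func)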

@@ -261,6 +265,7 @@ def test_fetch_openml_iris(monkeypatch, gzip_response):
     data_id = 61
     data_name = 'iris'
     data_version = 1
+    ignore_strings = False
     target_column = 'class'
     expected_observations = 150
     expected_features = 4
@@ -275,6 +280,7 @@ def test_fetch_openml_iris(monkeypatch, gzip_response):
         _fetch_dataset_from_openml,
         **{'data_id': data_id, 'data_name': data_name,
            'data_version': data_version,
+           'ignore_strings': ignore_strings,
            'target_column': target_column,
            'expected_observations': expected_observations,
            'expected_features': expected_features,
@@ -298,13 +304,15 @@ def test_fetch_openml_iris_multitarget(monkeypatch, gzip_response):
     data_id = 61
     data_name = 'iris'
     data_version = 1
+    ignore_strings = False
     target_column = ['sepallength', 'sepalwidth']
     expected_observations = 150
     expected_features = 3
     expected_missing = 0

     _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)
-    _fetch_dataset_from_openml(data_id, data_name, data_version, target_column,
+    _fetch_dataset_from_openml(data_id, data_name, data_version,
+                               ignore_strings, target_column,
                                expected_observations, expected_features,
                                expected_missing,
                                object, np.float64, expect_sparse=False,
@@ -317,13 +325,15 @@ def test_fetch_openml_anneal(monkeypatch, gzip_response):
     data_id = 2
     data_name = 'anneal'
     data_version = 1
+    ignore_strings = False
     target_column = 'class'
     # Not all original instances included for space reasons
     expected_observations = 11
     expected_features = 38
     expected_missing = 267
     _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)
-    _fetch_dataset_from_openml(data_id, data_name, data_version, target_column,
+    _fetch_dataset_from_openml(data_id, data_name, data_version,
+                               ignore_strings, target_column,
                                expected_observations, expected_features,
                                expected_missing,
                                object, object, expect_sparse=False,
@@ -342,13 +352,15 @@ def test_fetch_openml_anneal_multitarget(monkeypatch, gzip_response):
     data_id = 2
     data_name = 'anneal'
     data_version = 1
+    ignore_strings = False
     target_column = ['class', 'product-type', 'shape']
     # Not all original instances included for space reasons
     expected_observations = 11
     expected_features = 36
     expected_missing = 267
     _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)
-    _fetch_dataset_from_openml(data_id, data_name, data_version, target_column,
+    _fetch_dataset_from_openml(data_id, data_name, data_version,
+                               ignore_strings, target_column,
                                expected_observations, expected_features,
                                expected_missing,
                                object, object, expect_sparse=False,
@@ -361,12 +373,14 @@ def test_fetch_openml_cpu(monkeypatch, gzip_response):
     data_id = 561
     data_name = 'cpu'
     data_version = 1
+    ignore_strings = False
     target_column = 'class'
     expected_observations = 209
     expected_features = 7
     expected_missing = 0
     _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)
-    _fetch_dataset_from_openml(data_id, data_name, data_version, target_column,
+    _fetch_dataset_from_openml(data_id, data_name, data_version,
+                               ignore_strings, target_column,
                                expected_observations, expected_features,
                                expected_missing,
                                object, np.float64, expect_sparse=False,
@@ -388,6 +402,7 @@ def test_fetch_openml_australian(monkeypatch, gzip_response):
     data_id = 292
     data_name = 'Australian'
     data_version = 1
+    ignore_strings = False
     target_column = 'Y'
     # Not all original instances included for space reasons
     expected_observations = 85
@@ -400,6 +415,7 @@ def test_fetch_openml_australian(monkeypatch, gzip_response):
         _fetch_dataset_from_openml,
         **{'data_id': data_id, 'data_name': data_name,
            'data_version': data_version,
+           'ignore_strings': ignore_strings,
            'target_column': target_column,
            'expected_observations': expected_observations,
            'expected_features': expected_features,
@@ -417,13 +433,15 @@ def test_fetch_openml_adultcensus(monkeypatch, gzip_response):
     data_id = 1119
     data_name = 'adult-census'
     data_version = 1
+    ignore_strings = False
     target_column = 'class'
     # Not all original instances included for space reasons
     expected_observations = 10
     expected_features = 14
     expected_missing = 0
     _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)
-    _fetch_dataset_from_openml(data_id, data_name, data_version, target_column,
+    _fetch_dataset_from_openml(data_id, data_name, data_version,
+                               ignore_strings, target_column,
                                expected_observations, expected_features,
                                expected_missing,
                                np.float64, object, expect_sparse=False,
@@ -439,13 +457,15 @@ def test_fetch_openml_miceprotein(monkeypatch, gzip_response):
     data_id = 40966
     data_name = 'MiceProtein'
     data_version = 4
+    ignore_strings = False
     target_column = 'class'
     # Not all original instances included for space reasons
    expected_observations = 7
     expected_features = 77
     expected_missing = 7
     _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)
-    _fetch_dataset_from_openml(data_id, data_name, data_version, target_column,
+    _fetch_dataset_from_openml(data_id, data_name, data_version,
+                               ignore_strings, target_column,
                                expected_observations, expected_features,
                                expected_missing,
                                np.float64, object, expect_sparse=False,
@@ -458,14 +478,16 @@ def test_fetch_openml_emotions(monkeypatch, gzip_response):
     data_id = 40589
     data_name = 'emotions'
     data_version = 3
+    ignore_strings = False
     target_column = ['amazed.suprised', 'happy.pleased', 'relaxing.calm',
                      'quiet.still', 'sad.lonely', 'angry.aggresive']
     expected_observations = 13
     expected_features = 72
     expected_missing = 0
     _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)

-    _fetch_dataset_from_openml(data_id, data_name, data_version, target_column,
+    _fetch_dataset_from_openml(data_id, data_name, data_version,
+                               ignore_strings, target_column,
                                expected_observations, expected_features,
                                expected_missing,
                                np.float64, object, expect_sparse=False,
@@ -478,6 +500,27 @@ def test_decode_emotions(monkeypatch):
     _test_features_list(data_id)


+@pytest.mark.parametrize('gzip_response', [True, False])
+def test_fetch_titanic(monkeypatch, gzip_response):
+    # dataset with string attributes; check that they can be ignored
+    data_id = 40945
+    data_name = 'Titanic'
+    data_version = 1
+    ignore_strings = True
+    target_column = 'survived'
+    # Not all original features included because five are strings
+    expected_observations = 1309
+    expected_features = 8
+    expected_missing = 1454
+    _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)
+    _fetch_dataset_from_openml(data_id, data_name, data_version,
+                               ignore_strings, target_column,
+                               expected_observations, expected_features,
+                               expected_missing,
+                               np.float64, object, expect_sparse=False,
+                               compare_default_target=True)
+
+
 @pytest.mark.parametrize('gzip_response', [True, False])
 def test_open_openml_url_cache(monkeypatch, gzip_response, tmpdir):
     data_id = 61
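A quick sanity check of the expected_features = 8 figure in the new test, assuming the standard Titanic column inventory on OpenML; the five string columns listed here are an assumption based on the test comment, not read from the diff.

# Hypothetical column inventory for OpenML's Titanic (data_id=40945).
all_columns = ['pclass', 'survived', 'name', 'sex', 'age', 'sibsp', 'parch',
               'ticket', 'fare', 'cabin', 'embarked', 'boat', 'body',
               'home.dest']
string_columns = ['name', 'ticket', 'cabin', 'boat', 'home.dest']  # assumed
target = 'survived'

remaining = [c for c in all_columns
             if c not in string_columns and c != target]
print(len(remaining))  # 8, matching expected_features in test_fetch_titanic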
@@ -667,7 +710,8 @@ def test_string_attribute(monkeypatch, gzip_response):
     # single column test
     assert_raise_message(ValueError,
                          'STRING attributes are not yet supported',
-                         fetch_openml, data_id=data_id, cache=False)
+                         fetch_openml, data_id=data_id, ignore_strings=False,
+                         cache=False)


 @pytest.mark.parametrize('gzip_response', [True, False])
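The updated assertion only pins down the ignore_strings=False path. A complementary test one could add alongside it (a sketch that reuses the module's monkey-patching helpers; the choice of data_id is an assumption, the Titanic mock added in this commit being a natural candidate):

@pytest.mark.parametrize('gzip_response', [True, False])
def test_string_attribute_ignored(monkeypatch, gzip_response):
    # Assumed to live in test_openml.py, which already imports pytest,
    # fetch_openml and _monkey_patch_webbased_functions.
    data_id = 40945  # assumption: the mocked Titanic dataset
    _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)
    # With ignore_strings=True the STRING columns are dropped instead of
    # raising ValueError, so the fetch should simply succeed.
    bunch = fetch_openml(data_id=data_id, ignore_strings=True, cache=False)
    assert bunch.data.shape[0] > 0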
