8000 MNT Memory usage of OpenML fetcher: use generator from arff (#13312) · xhluca/scikit-learn@024c9ba · GitHub
[go: up one dir, main page]

Skip to content

Commit 024c9ba

Browse files
jorisvandenbossche authored and Xing committed
MNT Memory usage of OpenML fetcher: use generator from arff (scikit-learn#13312)
* Memory usage of OpenML fetcher: use generator from arff
* fix actually getting data qualities in all cases
* Add qualities responses
* add workaround for cases where data qualities are not available
* feedback joel
1 parent 75614ef commit 024c9ba

11 files changed

+57
-10
lines changed

sklearn/datasets/openml.py

Lines changed: 56 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
from warnings import warn
77
from contextlib import closing
88
from functools import wraps
9+
import itertools
10+
from collections.abc import Generator
911

1012
from urllib.request import urlopen, Request
1113

@@ -23,6 +25,7 @@
2325
_SEARCH_NAME = "api/v1/json/data/list/data_name/{}/limit/2"
2426
_DATA_INFO = "api/v1/json/data/{}"
2527
_DATA_FEATURES = "api/v1/json/data/features/{}"
28+
_DATA_QUALITIES = "api/v1/json/data/qualities/{}"
2629
_DATA_FILE = "data/v1/download/{}"
2730

2831

@@ -210,7 +213,7 @@ def _sparse_data_to_array(arff_data, include_columns):
210213
return y
211214

212215

213-
def _convert_arff_data(arff_data, col_slice_x, col_slice_y):
216+
def _convert_arff_data(arff_data, col_slice_x, col_slice_y, shape=None):
214217
"""
215218
converts the arff object into the appropriate matrix type (np.array or
216219
scipy.sparse.csr_matrix) based on the 'data part' (i.e., in the
@@ -234,10 +237,16 @@ def _convert_arff_data(arff_data, col_slice_x, col_slice_y):
234237
X : np.array or scipy.sparse.csr_matrix
235238
y : np.array
236239
"""
237-
if isinstance(arff_data, list):
238-
data = np.array(arff_data, dtype=np.float64)
239-
X = np.array(data[:, col_slice_x], dtype=np.float64)
240-
y = np.array(data[:, col_slice_y], dtype=np.float64)
240+
if isinstance(arff_data, Generator):
241+
if shape[0] == -1:
242+
count = -1
243+
else:
244+
count = shape[0] * shape[1]
245+
data = np.fromiter(itertools.chain.from_iterable(arff_data),
246+
dtype='float64', count=count)
247+
data = data.reshape(*shape)
248+
X = data[:, col_slice_x]
249+
y = data[:, col_slice_y]
241250
return X, y
242251
elif isinstance(arff_data, tuple):
243252
arff_data_X = _split_sparse_columns(arff_data, col_slice_x)
@@ -333,6 +342,34 @@ def _get_data_features(data_id, data_home):
333342
return json_data['data_features']['feature']
334343

335344

345+
def _get_data_qualities(data_id, data_home):
    """Return the list of data qualities for ``data_id``, or None.

    Queries the OpenML qualities endpoint; see
    https://www.openml.org/api_docs#!/data/get_data_qualities_id
    """
    error_message = "Dataset with data_id {} not found.".format(data_id)
    json_data = _get_json_content_from_openml_api(
        _DATA_QUALITIES.format(data_id), error_message, True, data_home)
    try:
        qualities = json_data['data_qualities']['quality']
    except KeyError:
        # Qualities may be unavailable for some datasets; the caller can
        # still try to process the data without them.
        qualities = None
    return qualities
358+
359+
360+
def _get_data_shape(data_qualities):
361+
# Using the data_info dictionary from _get_data_info_by_name to extract
362+
# the number of samples / features
363+
if data_qualities is None:
364+
return None
365+
qualities = {d['name']: d['value'] for d in data_qualities}
366+
try:
367+
return (int(float(qualities['NumberOfInstances'])),
368+
int(float(qualities['NumberOfFeatures'])))
369+
except AttributeError:
370+
return None
371+
372+
336373
def _download_data_arff(file_id, sparse, data_home, encode_nominal=True):
337374
# Accesses an ARFF file on the OpenML server. Documentation:
338375
# https://www.openml.org/api_data_docs#!/data/get_download_id
@@ -346,7 +383,7 @@ def _arff_load():
346383
if sparse is True:
347384
return_type = _arff.COO
348385
else:
349-
return_type = _arff.DENSE
386+
return_type = _arff.DENSE_GEN
350387

351388
arff_file = _arff.loads(response.read().decode('utf-8'),
352389
encode_nominal=encode_nominal,
@@ -508,7 +545,7 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None,
508545
data_id = data_info['did']
509546
elif data_id is not None:
510547
# from the previous if statement, it is given that name is None
511-
if version is not "active":
548+
if version != "active":
512549
raise ValueError(
513550
"Dataset data_id={} and version={} passed, but you can only "
514551
"specify a numeric data_id or a version, not "
@@ -584,18 +621,28 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None,
584621
if data_description['format'].lower() == 'sparse_arff':
585622
return_sparse = True
586623

624+
if not return_sparse:
625+
data_qualities = _get_data_qualities(data_id, data_home)
626+
shape = _get_data_shape(data_qualities)
627+
# if the data qualities were not available, we can still get the
628+
# n_features from the feature list, with the n_samples unknown
629+
if shape is None:
630+
shape = (-1, len(features_list))
631+
else:
632+
shape = None
633+
587634
# obtain the data
588635
arff = _download_data_arff(data_description['file_id'], return_sparse,
589636
data_home)
590-
arff_data = arff['data']
637+
591638
# nominal attributes is a dict mapping from the attribute name to the
592639
# possible values. Includes also the target column (which will be popped
593640
# off below, before it will be packed in the Bunch object)
594641
nominal_attributes = {k: v for k, v in arff['attributes']
595642
if isinstance(v, list) and
596643
k in data_columns + target_column}
597644

598-
X, y = _convert_arff_data(arff_data, col_slice_x, col_slice_y)
645+
X, y = _convert_arff_data(arff['data'], col_slice_x, col_slice_y, shape)
599646

600647
is_classification = {col_name in nominal_attributes
601648
for col_name in target_column}
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.

0 commit comments

Comments (0)