Revert "MNT Memory usage of OpenML fetcher: use generator from arff (… · xhluca/scikit-learn@e9b62e3 · GitHub

Commit e9b62e3

Xing committed
Revert "MNT Memory usage of OpenML fetcher: use generator from arff (scikit-learn#13312)"
This reverts commit 024c9ba.
1 parent 2daec5a commit e9b62e3

11 files changed (+10, -57 lines)

sklearn/datasets/openml.py

Lines changed: 9 additions & 56 deletions
@@ -6,8 +6,6 @@
 from warnings import warn
 from contextlib import closing
 from functools import wraps
-import itertools
-from collections.abc import Generator
 
 from urllib.request import urlopen, Request
 
@@ -25,7 +23,6 @@
 _SEARCH_NAME = "api/v1/json/data/list/data_name/{}/limit/2"
 _DATA_INFO = "api/v1/json/data/{}"
 _DATA_FEATURES = "api/v1/json/data/features/{}"
-_DATA_QUALITIES = "api/v1/json/data/qualities/{}"
 _DATA_FILE = "data/v1/download/{}"
 
 
@@ -213,7 +210,7 @@ def _sparse_data_to_array(arff_data, include_columns):
     return y
 
 
-def _convert_arff_data(arff_data, col_slice_x, col_slice_y, shape=None):
+def _convert_arff_data(arff_data, col_slice_x, col_slice_y):
     """
     converts the arff object into the appropriate matrix type (np.array or
     scipy.sparse.csr_matrix) based on the 'data part' (i.e., in the
@@ -237,16 +234,10 @@ def _convert_arff_data(arff_data, col_slice_x, col_slice_y, shape=None):
     X : np.array or scipy.sparse.csr_matrix
     y : np.array
     """
-    if isinstance(arff_data, Generator):
-        if shape[0] == -1:
-            count = -1
-        else:
-            count = shape[0] * shape[1]
-        data = np.fromiter(itertools.chain.from_iterable(arff_data),
-                           dtype='float64', count=count)
-        data = data.reshape(*shape)
-        X = data[:, col_slice_x]
-        y = data[:, col_slice_y]
+    if isinstance(arff_data, list):
+        data = np.array(arff_data, dtype=np.float64)
+        X = np.array(data[:, col_slice_x], dtype=np.float64)
+        y = np.array(data[:, col_slice_y], dtype=np.float64)
         return X, y
     elif isinstance(arff_data, tuple):
         arff_data_X = _split_sparse_columns(arff_data, col_slice_x)
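
For context on this hunk: the reverted code streamed the ARFF rows through a generator and filled a NumPy array directly with np.fromiter, while the restored code materializes the whole list of rows before slicing. Below is a minimal sketch of the two conversion paths, using made-up rows rather than a real ARFF payload (the variable names are illustrative, not taken from openml.py):

import itertools
import numpy as np

rows = [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]   # stand-in for arff['data']
shape = (2, 3)                              # (n_samples, n_features)

# Restored path: the full list of Python lists is converted in one call.
dense = np.array(rows, dtype=np.float64)

# Reverted path: rows are streamed into a flat buffer and reshaped, which
# avoids holding both the list of lists and the array in memory at once.
streamed = np.fromiter(itertools.chain.from_iterable(iter(rows)),
                       dtype='float64', count=shape[0] * shape[1])
streamed = streamed.reshape(*shape)

assert np.array_equal(dense, streamed)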
@@ -342,34 +333,6 @@ def _get_data_features(data_id, data_home):
     return json_data['data_features']['feature']
 
 
-def _get_data_qualities(data_id, data_home):
-    # OpenML API function:
-    # https://www.openml.org/api_docs#!/data/get_data_qualities_id
-    url = _DATA_QUALITIES.format(data_id)
-    error_message = "Dataset with data_id {} not found.".format(data_id)
-    json_data = _get_json_content_from_openml_api(url, error_message, True,
-                                                  data_home)
-    try:
-        return json_data['data_qualities']['quality']
-    except KeyError:
-        # the qualities might not be available, but we still try to process
-        # the data
-        return None
-
-
-def _get_data_shape(data_qualities):
-    # Using the data_info dictionary from _get_data_info_by_name to extract
-    # the number of samples / features
-    if data_qualities is None:
-        return None
-    qualities = {d['name']: d['value'] for d in data_qualities}
-    try:
-        return (int(float(qualities['NumberOfInstances'])),
-                int(float(qualities['NumberOfFeatures'])))
-    except AttributeError:
-        return None
-
-
 def _download_data_arff(file_id, sparse, data_home, encode_nominal=True):
     # Accesses an ARFF file on the OpenML server. Documentation:
     # https://www.openml.org/api_data_docs#!/data/get_download_id
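
The two helpers removed above converted the OpenML qualities payload into an expected (n_samples, n_features) shape for the generator path. A small illustration of that mapping, using a hypothetical qualities list shaped like the API response the helpers parsed:

# Hypothetical excerpt of an OpenML qualities payload: a list of name/value dicts.
data_qualities = [
    {'name': 'NumberOfInstances', 'value': '150.0'},
    {'name': 'NumberOfFeatures', 'value': '5.0'},
]

qualities = {d['name']: d['value'] for d in data_qualities}
shape = (int(float(qualities['NumberOfInstances'])),
         int(float(qualities['NumberOfFeatures'])))
print(shape)  # (150, 5)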
@@ -383,7 +346,7 @@ def _arff_load():
             if sparse is True:
                 return_type = _arff.COO
             else:
-                return_type = _arff.DENSE_GEN
+                return_type = _arff.DENSE
 
             arff_file = _arff.loads(response.read().decode('utf-8'),
                                     encode_nominal=encode_nominal,
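
This hunk switches the ARFF return type back from the streaming DENSE_GEN decoder to the fully materialized DENSE one, so arff['data'] is again a list of rows rather than a generator. A rough sketch of what that return type means, assuming the standalone liac-arff package imported as arff (scikit-learn itself calls a bundled copy imported as _arff):

import arff  # standalone liac-arff package

ARFF_TEXT = '''@RELATION demo
@ATTRIBUTE width NUMERIC
@ATTRIBUTE class {a,b}
@DATA
1.0,a
2.0,b
'''

# return_type=arff.DENSE yields 'data' as a fully materialized list of rows;
# the reverted DENSE_GEN variant yields a generator over the same rows instead.
decoded = arff.loads(ARFF_TEXT, encode_nominal=True, return_type=arff.DENSE)
print(decoded['data'])  # [[1.0, 0], [2.0, 1]]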
@@ -545,7 +508,7 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None,
         data_id = data_info['did']
     elif data_id is not None:
         # from the previous if statement, it is given that name is None
-        if version != "active":
+        if version is not "active":
            raise ValueError(
                "Dataset data_id={} and version={} passed, but you can only "
                "specify a numeric data_id or a version, not "
@@ -621,28 +584,18 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None,
     if data_description['format'].lower() == 'sparse_arff':
         return_sparse = True
 
-    if not return_sparse:
-        data_qualities = _get_data_qualities(data_id, data_home)
-        shape = _get_data_shape(data_qualities)
-        # if the data qualities were not available, we can still get the
-        # n_features from the feature list, with the n_samples unknown
-        if shape is None:
-            shape = (-1, len(features_list))
-    else:
-        shape = None
-
     # obtain the data
     arff = _download_data_arff(data_description['file_id'], return_sparse,
                                data_home)
-
+    arff_data = arff['data']
     # nominal attributes is a dict mapping from the attribute name to the
     # possible values. Includes also the target column (which will be popped
     # off below, before it will be packed in the Bunch object)
     nominal_attributes = {k: v for k, v in arff['attributes']
                           if isinstance(v, list) and
                           k in data_columns + target_column}
 
-    X, y = _convert_arff_data(arff['data'], col_slice_x, col_slice_y, shape)
+    X, y = _convert_arff_data(arff_data, col_slice_x, col_slice_y)
 
     is_classification = {col_name in nominal_attributes
                          for col_name in target_column}
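
Both versions of this code path ultimately feed the same public entry point. A minimal usage sketch of the fetcher whose internals the commit touches (network access and an OpenML cache directory are required; the dataset name and version are only an example):

from sklearn.datasets import fetch_openml

# Downloads the dataset on first use and caches it under data_home.
bunch = fetch_openml(name='iris', version=1)
print(bunch.data.shape, bunch.target.shape)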
9 binary files changed (not shown)

0 commit comments
