6
6
from warnings import warn
7
7
from contextlib import closing
8
8
from functools import wraps
9
+ import itertools
10
+ from collections .abc import Generator
9
11
10
12
from urllib .request import urlopen , Request
11
13
23
25
_SEARCH_NAME = "api/v1/json/data/list/data_name/{}/limit/2"
24
26
_DATA_INFO = "api/v1/json/data/{}"
25
27
_DATA_FEATURES = "api/v1/json/data/features/{}"
28
+ _DATA_QUALITIES = "api/v1/json/data/qualities/{}"
26
29
_DATA_FILE = "data/v1/download/{}"
27
30
28
31
@@ -210,7 +213,7 @@ def _sparse_data_to_array(arff_data, include_columns):
210
213
return y
211
214
212
215
213
- def _convert_arff_data (arff_data , col_slice_x , col_slice_y ):
216
+ def _convert_arff_data (arff_data , col_slice_x , col_slice_y , shape = None ):
214
217
"""
215
218
converts the arff object into the appropriate matrix type (np.array or
216
219
scipy.sparse.csr_matrix) based on the 'data part' (i.e., in the
@@ -234,10 +237,16 @@ def _convert_arff_data(arff_data, col_slice_x, col_slice_y):
234
237
X : np.array or scipy.sparse.csr_matrix
235
238
y : np.array
236
239
"""
237
- if isinstance (arff_data , list ):
238
- data = np .array (arff_data , dtype = np .float64 )
239
- X = np .array (data [:, col_slice_x ], dtype = np .float64 )
240
- y = np .array (data [:, col_slice_y ], dtype = np .float64 )
240
+ if isinstance (arff_data , Generator ):
241
+ if shape [0 ] == - 1 :
242
+ count = - 1
243
+ else :
244
+ count = shape [0 ] * shape [1 ]
245
+ data = np .fromiter (itertools .chain .from_iterable (arff_data ),
246
+ dtype = 'float64' , count = count )
247
+ data = data .reshape (* shape )
248
+ X = data [:, col_slice_x ]
249
+ y = data [:, col_slice_y ]
241
250
return X , y
242
251
elif isinstance (arff_data , tuple ):
243
252
arff_data_X = _split_sparse_columns (arff_data , col_slice_x )
@@ -333,6 +342,34 @@ def _get_data_features(data_id, data_home):
333
342
return json_data ['data_features' ]['feature' ]
334
343
335
344
345
def _get_data_qualities(data_id, data_home):
    """Fetch the quality metadata of an OpenML dataset.

    OpenML API function:
    https://www.openml.org/api_docs#!/data/get_data_qualities_id

    Parameters
    ----------
    data_id : int
        The OpenML identifier of the dataset.
    data_home : str or None
        Location of the local cache passed through to the JSON fetcher.

    Returns
    -------
    list of dict or None
        The ``quality`` entries of the response, or None when the server
        response does not contain them.
    """
    qualities_url = _DATA_QUALITIES.format(data_id)
    not_found_msg = "Dataset with data_id {} not found.".format(data_id)
    response = _get_json_content_from_openml_api(
        qualities_url, not_found_msg, True, data_home)
    try:
        qualities = response['data_qualities']['quality']
    except KeyError:
        # The qualities may be absent from the response; return None so
        # the caller can still process the dataset without them.
        qualities = None
    return qualities
358
+
359
+
360
+ def _get_data_shape (data_qualities ):
361
+ # Using the data_info dictionary from _get_data_info_by_name to extract
362
+ # the number of samples / features
363
+ if data_qualities is None :
364
+ return None
365
+ qualities = {d ['name' ]: d ['value' ] for d in data_qualities }
366
+ try :
367
+ return (int (float (qualities ['NumberOfInstances' ])),
368
+ int (float (qualities ['NumberOfFeatures' ])))
369
+ except AttributeError :
370
+ return None
371
+
372
+
336
373
def _download_data_arff (file_id , sparse , data_home , encode_nominal = True ):
337
374
# Accesses an ARFF file on the OpenML server. Documentation:
338
375
# https://www.openml.org/api_data_docs#!/data/get_download_id
@@ -346,7 +383,7 @@ def _arff_load():
346
383
if sparse is True :
347
384
return_type = _arff .COO
348
385
else :
349
- return_type = _arff .DENSE
386
+ return_type = _arff .DENSE_GEN
350
387
351
388
arff_file = _arff .loads (response .read ().decode ('utf-8' ),
352
389
encode_nominal = encode_nominal ,
@@ -508,7 +545,7 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None,
508
545
data_id = data_info ['did' ]
509
546
elif data_id is not None :
510
547
# from the previous if statement, it is given that name is None
511
- if version is not "active" :
548
+ if version != "active" :
512
549
raise ValueError (
513
550
"Dataset data_id={} and version={} passed, but you can only "
514
551
"specify a numeric data_id or a version, not "
@@ -584,18 +621,28 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None,
584
621
if data_description ['format' ].lower () == 'sparse_arff' :
585
622
return_sparse = True
586
623
624
+ if not return_sparse :
625
+ data_qualities = _get_data_qualities (data_id , data_home )
626
+ shape = _get_data_shape (data_qualities )
627
+ # if the data qualities were not available, we can still get the
628
+ # n_features from the feature list, with the n_samples unknown
629
+ if shape is None :
630
+ shape = (- 1 , len (features_list ))
631
+ else :
632
+ shape = None
633
+
587
634
# obtain the data
588
635
arff = _download_data_arff (data_description ['file_id' ], return_sparse ,
589
636
data_home )
590
- arff_data = arff [ 'data' ]
637
+
591
638
# nominal attributes is a dict mapping from the attribute name to the
592
639
# possible values. Includes also the target column (which will be popped
593
640
# off below, before it will be packed in the Bunch object)
594
641
nominal_attributes = {k : v for k , v in arff ['attributes' ]
595
642
if isinstance (v , list ) and
596
643
k in data_columns + target_column }
597
644
598
- X , y = _convert_arff_data (arff_data , col_slice_x , col_slice_y )
645
+ X , y = _convert_arff_data (arff [ 'data' ] , col_slice_x , col_slice_y , shape )
599
646
600
647
is_classification = {col_name in nominal_attributes
601
648
for col_name in target_column }
0 commit comments