6
6
from warnings import warn
7
7
from contextlib import closing
8
8
from functools import wraps
9
- import itertools
10
- from collections .abc import Generator
11
9
12
10
from urllib .request import urlopen , Request
13
11
25
23
_SEARCH_NAME = "api/v1/json/data/list/data_name/{}/limit/2"
26
24
_DATA_INFO = "api/v1/json/data/{}"
27
25
_DATA_FEATURES = "api/v1/json/data/features/{}"
28
- _DATA_QUALITIES = "api/v1/json/data/qualities/{}"
29
26
_DATA_FILE = "data/v1/download/{}"
30
27
31
28
@@ -213,7 +210,7 @@ def _sparse_data_to_array(arff_data, include_columns):
213
210
return y
214
211
215
212
216
- def _convert_arff_data (
8000
arff_data , col_slice_x , col_slice_y , shape = None ):
213
+ def _convert_arff_data (arff_data , col_slice_x , col_slice_y ):
217
214
"""
218
215
converts the arff object into the appropriate matrix type (np.array or
219
216
scipy.sparse.csr_matrix) based on the 'data part' (i.e., in the
@@ -237,16 +234,10 @@ def _convert_arff_data(arff_data, col_slice_x, col_slice_y, shape=None):
237
234
X : np.array or scipy.sparse.csr_matrix
238
235
y : np.array
239
236
"""
240
- if isinstance (arff_data , Generator ):
241
- if shape [0 ] == - 1 :
242
- count = - 1
243
- else :
244
- count = shape [0 ] * shape [1 ]
245
- data = np .fromiter (itertools .chain .from_iterable (arff_data ),
246
- dtype = 'float64' , count = count )
247
- data = data .reshape (* shape )
248
- X = data [:, col_slice_x ]
249
- y = data [:, col_slice_y ]
237
+ if isinstance (arff_data , list ):
238
+ data = np .array (arff_data , dtype = np .float64 )
239
+ X = np .array (data [:, col_slice_x ], dtype = np .float64 )
240
+ y = np .array (data [:, col_slice_y ], dtype = np .float64 )
250
241
return X , y
251
242
elif isinstance (arff_data , tuple ):
252
243
arff_data_X = _split_sparse_columns (arff_data , col_slice_x )
@@ -342,34 +333,6 @@ def _get_data_features(data_id, data_home):
342
333
return json_data ['data_features' ]['feature' ]
343
334
344
335
345
def _get_data_qualities(data_id, data_home):
    """Fetch the list of data qualities of an OpenML dataset.

    OpenML API function:
    https://www.openml.org/api_docs#!/data/get_data_qualities_id

    Parameters
    ----------
    data_id : int
        Identifier of the dataset; it is formatted into the qualities
        endpoint and into the "not found" error message.

    data_home : str or None
        Forwarded unchanged to ``_get_json_content_from_openml_api``
        (presumably the local cache location -- confirm against that
        helper).

    Returns
    -------
    qualities : object or None
        The content stored under ``['data_qualities']['quality']`` in the
        JSON answer, or None when one of these keys is missing.
    """
    response = _get_json_content_from_openml_api(
        _DATA_QUALITIES.format(data_id),
        "Dataset with data_id {} not found.".format(data_id),
        True,
        data_home,
    )
    try:
        found = response['data_qualities']['quality']
    except KeyError:
        # Missing qualities are not fatal: the caller may still be able to
        # process the data without them.
        return None
    return found
358
-
359
-
360
- def _get_data_shape (data_qualities ):
361
- # Using the data_info dictionary from _get_data_info_by_name to extract
362
- # the number of samples / features
363
- if data_qualities is None :
364
- return None
365
- qualities = {d ['name' ]: d ['value' ] for d in data_qualities }
366
- try :
367
- return (int (float (qualities ['NumberOfInstances' ])),
368
- int (float (qualities ['NumberOfFeatures' ])))
369
- except AttributeError :
370
- return None
371
-
372
-
373
336
def _download_data_arff (file_id , sparse , data_home , encode_nominal = True ):
374
337
# Accesses an ARFF file on the OpenML server. Documentation:
375
338
# https://www.openml.org/api_data_docs#!/data/get_download_id
@@ -383,7 +346,7 @@ def _arff_load():
383
346
if sparse is True :
384
347
return_type = _arff .COO
385
348
else :
386
- return_type = _arff .DENSE_GEN
349
+ return_type = _arff .DENSE
387
350
388
351
arff_file = _arff .loads (response .read ().decode ('utf-8' ),
389
352
encode_nominal = encode_nominal ,
@@ -545,7 +508,7 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None,
545
508
data_id = data_info ['did' ]
546
509
elif data_id is not None :
547
510
# from the previous if statement, it is given that name is None
548
- if version != "active" :
511
+ if version is not "active" :
549
512
raise ValueError (
550
513
"Dataset data_id={} and version={} passed, but you can only "
551
514
"specify a numeric data_id or a version, not "
@@ -621,28 +584,18 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None,
621
584
if data_description ['format' ].lower () == 'sparse_arff' :
622
585
return_sparse = True
623
586
624
- if not return_sparse :
625
- data_qualities = _get_data_qualities (data_id , data_home )
626
- shape = _get_data_shape (data_qualities )
627
- # if the data qualities were not available, we can still get the
628
- # n_features from the feature list, with the n_samples unknown
629
- if shape is None :
630
- shape = (- 1 , len (features_list ))
631
- else :
632
- shape = None
633
-
634
587
# obtain the data
635
588
arff = _download_data_arff (data_description ['file_id' ], return_sparse ,
636
589
data_home )
637
-
590
+ arff_data = arff [ 'data' ]
638
591
# nominal attributes is a dict mapping from the attribute name to the
639
592
# possible values. Includes also the target column (which will be popped
640
593
# off below, before it will be packed in the Bunch object)
641
594
nominal_attributes = {k : v for k , v in arff ['attributes' ]
642
595
if isinstance (v , list ) and
643
596
k in data_columns + target_column }
644
597
645
- X , y = _convert_arff_data (arff [ 'data' ] , col_slice_x , col_slice_y , shape )
598
+ X , y = _convert_arff_data (arff_data , col_slice_x , col_slice_y )
646
599
647
600
is_classification = {col_name in nominal_attributes
648
601
for col_name in target_column }
0 commit comments