8
8
from functools import wraps
9
9
import itertools
10
10
from collections .abc import Generator
11
+ from collections import OrderedDict
11
12
12
13
from urllib .request import urlopen , Request
13
14
18
19
from .base import get_data_home
19
20
from urllib .error import HTTPError
20
21
from ..utils import Bunch
22
+ from ..utils import get_chunk_n_rows
23
+ from ..utils import _chunk_generator
24
+ from ..utils import check_pandas_support # noqa
21
25
22
26
__all__ = ['fetch_openml' ]
23
27
@@ -263,6 +267,69 @@ def _convert_arff_data(arff_data, col_slice_x, col_slice_y, shape=None):
263
267
raise ValueError ('Unexpected Data Type obtained from arff.' )
264
268
265
269
270
def _feature_to_dtype(feature):
    """Map an OpenML feature description to a dtype for a pandas DataFrame.

    Parameters
    ----------
    feature : dict
        Feature info. The ``'data_type'`` key is always read;
        ``'number_of_missing_values'`` is read for integer features only.

    Returns
    -------
    dtype : object, 'category', np.float64 or np.int64
        ``object`` for string features, the string ``'category'`` for
        nominal features, ``np.float64`` for numeric/real features and for
        integer features that have missing values, ``np.int64`` otherwise.

    Raises
    ------
    ValueError
        If ``data_type`` is not one of 'string', 'nominal', 'numeric',
        'real' or 'integer', regardless of the number of missing values.
    """
    data_type = feature['data_type']
    if data_type == 'string':
        return object
    if data_type == 'nominal':
        return 'category'
    if data_type in ('numeric', 'real'):
        return np.float64
    if data_type == 'integer':
        # An int64 column cannot hold NaN, so fall back to floats when any
        # value is missing. The count is accepted both as the string '0'
        # and as the integer 0.
        has_missing = feature['number_of_missing_values'] not in ('0', 0)
        return np.float64 if has_missing else np.int64
    # The data type is validated before the missing-value count is looked
    # at, so an unsupported type is never silently mapped to float64.
    raise ValueError('Unsupported feature: {}'.format(feature))
285
+
286
+
287
def _convert_arff_data_dataframe(arrf, columns, features_dict):
    """Convert the ARFF object into a pandas DataFrame.

    The rows are consumed in chunks so that each intermediate DataFrame
    stays within the working-memory budget reported by
    ``get_chunk_n_rows``.

    Parameters
    ----------
    arrf : dict
        As obtained from liac-arff object. The ``'attributes'`` entry
        (pairs of attribute name and attribute type) and the ``'data'``
        entry (an iterable of rows) are read.

    columns : list
        Columns from dataframe to return.

    features_dict : dict
        Maps feature name to feature info from openml.

    Returns
    -------
    dataframe : pandas DataFrame
        Contains the requested columns in ARFF attribute order, with a
        unique 0..n-1 index and one dtype per column as given by
        ``_feature_to_dtype``.
    """
    pd = check_pandas_support('fetch_openml with as_frame=True')

    attributes = OrderedDict(arrf['attributes'])
    arrf_columns = list(attributes)

    # Build the lookup once; keep the ARFF attribute order in the result.
    requested = set(columns)
    columns_to_keep = [col for col in arrf_columns if col in requested]

    # iter() is a no-op on a generator, so the chunk loop below continues
    # from the row that follows the first one.
    rows = iter(arrf['data'])
    first_row = next(rows, None)

    if first_row is None:
        # No data rows at all: return an empty frame instead of leaking
        # a bare StopIteration to the caller.
        df = pd.DataFrame(columns=columns_to_keep)
    else:
        # calculate chunksize from the memory footprint of a single row
        first_df = pd.DataFrame([first_row], columns=arrf_columns)
        row_bytes = first_df.memory_usage(deep=True).sum()
        chunksize = get_chunk_n_rows(row_bytes)

        # read arrf data with chunks
        dfs = [first_df[columns_to_keep]]
        for data in _chunk_generator(rows, chunksize):
            dfs.append(
                pd.DataFrame(data, columns=arrf_columns)[columns_to_keep])
        # Every chunk starts its own index at 0; ignore_index gives the
        # result a unique RangeIndex instead of repeated labels.
        df = pd.concat(dfs, ignore_index=True)

    for column in columns_to_keep:
        dtype = _feature_to_dtype(features_dict[column])
        if dtype == 'category':
            # use the categories declared in the ARFF header
            dtype = pd.api.types.CategoricalDtype(attributes[column])
        df[column] = df[column].astype(dtype, copy=False)
    return df
331
+
332
+
266
333
def _get_data_info_by_name (name , version , data_home ):
267
334
"""
268
335
Utilizes the openml dataset listing api to find a dataset by
@@ -436,7 +503,8 @@ def _valid_data_column_names(features_list, target_columns):
436
503
437
504
438
505
def fetch_openml (name = None , version = 'active' , data_id = None , data_home = None ,
439
- target_column = 'default-target' , cache = True , return_X_y = False ):
506
+ target_column = 'default-target' , cache = True , return_X_y = False ,
507
+ as_frame = False ):
440
508
"""Fetch dataset from openml by name or dataset id.
441
509
442
510
Datasets are uniquely identified by either an integer ID or by a
@@ -489,26 +557,39 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None,
489
557
If True, returns ``(data, target)`` instead of a Bunch object. See
490
558
below for more information about the `data` and `target` objects.
491
559
560
+ as_frame : boolean, default=False
561
+ If True, the data is a pandas DataFrame including columns with
562
+ appropriate dtypes (numeric, string or categorical). The target is
563
+ a pandas DataFrame or Series depending on the number of target_columns.
564
+ The Bunch will contain a ``frame`` attribute with the target and the
565
+ data. If ``return_X_y`` is True, then ``(data, target)`` will be pandas
566
+ DataFrames or Series as described above.
567
+
492
568
Returns
493
569
-------
494
570
495
571
data : Bunch
496
572
Dictionary-like object, with attributes:
497
573
498
- data : np.array or scipy.sparse.csr_matrix of floats
574
+ data : np.array, scipy.sparse.csr_matrix of floats, or pandas DataFrame
499
575
The feature matrix. Categorical features are encoded as ordinals.
500
- target : np.array
576
+ target : np.array, pandas Series or DataFrame
501
577
The regression target or classification labels, if applicable.
502
- Dtype is float if numeric, and object if categorical.
578
+ Dtype is float if numeric, and object if categorical. If
579
+ ``as_frame`` is True, ``target`` is a pandas object.
503
580
DESCR : str
504
581
The full description of the dataset
505
582
feature_names : list
506
583
The names of the dataset columns
507
- categories : dict
584
+ categories : dict or None
508
585
Maps each categorical feature name to a list of values, such
509
- that the value encoded as i is ith in the list.
586
+ that the value encoded as i is ith in the list. If ``as_frame``
587
+ is True, this is None.
510
588
details : dict
511
589
More metadata from OpenML
590
+ frame : pandas DataFrame
591
+ Only present when `as_frame=True`. DataFrame with ``data`` and
592
+ ``target``.
512
593
513
594
(data, target) : tuple if ``return_X_y`` is True
514
595
@@ -568,41 +649,52 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None,
568
649
warn ("OpenML raised a warning on the dataset. It might be "
569
650
"unusable. Warning: {}" .format (data_description ['warning' ]))
570
651
652
+ return_sparse = False
653
+ if data_description ['format' ].lower () == 'sparse_arff' :
654
+ return_sparse = True
655
+
656
+ if as_frame and return_sparse :
657
+ raise ValueError ('Cannot return dataframe with sparse data' )
658
+
571
659
# download data features, meta-info about column types
572
660
features_list = _get_data_features (data_id , data_home )
573
661
574
- for feature in features_list :
575
- if 'true' in (feature ['is_ignore' ], feature ['is_row_identifier' ]):
576
- continue
577
- if feature ['data_type' ] == 'string' :
578
- raise ValueError ('STRING attributes are not yet supported' )
662
+ if not as_frame :
663
+ for feature in features_list :
664
+ if 'true' in (feature ['is_ignore' ], feature ['is_row_identifier' ]):
665
+ continue
666
+ if feature ['data_type' ] == 'string' :
667
+ raise ValueError ('STRING attributes are not supported for '
668
+ 'array representation. Try as_frame=True' )
579
669
580
670
if target_column == "default-target" :
581
671
# determines the default target based on the data feature results
582
672
# (which is currently more reliable than the data description;
583
673
# see issue: https://github.com/openml/OpenML/issues/768)
584
- target_column = [feature ['name' ] for feature in features_list
585
- if feature ['is_target' ] == 'true' ]
674
+ target_columns = [feature ['name' ] for feature in features_list
675
+ if feature ['is_target' ] == 'true' ]
586
676
elif isinstance (target_column , str ):
587
677
# for code-simplicity, make target_column by default a list
588
- target_column = [target_column ]
678
+ target_columns = [target_column ]
589
679
elif target_column is None :
590
- target_column = []
591
- elif not isinstance (target_column , list ):
680
+ target_columns = []
681
+ elif isinstance (target_column , list ):
682
+ target_columns = target_column
683
+ else :
592
684
raise TypeError ("Did not recognize type of target_column"
593
685
"Should be str, list or None. Got: "
594
686
"{}" .format (type (target_column )))
595
687
data_columns = _valid_data_column_names (features_list ,
596
- target_column )
688
+ target_columns )
597
689
598
690
# prepare which columns and data types should be returned for the X and y
599
691
features_dict = {feature ['name' ]: feature for feature in features_list }
600
692
601
693
# XXX: col_slice_y should be all nominal or all numeric
602
- _verify_target_data_type (features_dict , target_column )
694
+ _verify_target_data_type (features_dict , target_columns )
603
695
604
696
col_slice_y = [int (features_dict [col_name ]['index' ])
605
- for col_name in target_column ]
697
+ for col_name in target_columns ]
606
698
607
699
col_slice_x = [int (features_dict [col_name ]['index' ])
608
700
for col_name in data_columns ]
@@ -615,10 +707,6 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None,
615
707
'columns. ' .format (feat ['name' ], nr_missing ))
616
708
617
709
# determine arff encoding to return
618
- return_sparse = False
619
- if data_description ['format' ].lower () == 'sparse_arff' :
620
- return_sparse = True
621
-
622
710
if not return_sparse :
623
711
data_qualities = _get_data_qualities (data_id , data_home )
624
712
shape = _get_data_shape (data_qualities )
@@ -631,46 +719,62 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None,
631
719
632
720
# obtain the data
633
721
arff = _download_data_arff (data_description ['file_id' ], return_sparse ,
634
- data_home )
635
-
636
- # nominal attributes is a dict mapping from the attribute name to the
637
- # possible values. Includes also the target column (which will be popped
638
- # off below, before it will be packed in the Bunch object)
639
- nominal_attributes = {k : v for k , v in arff ['attributes' ]
640
- if isinstance (v , list ) and
641
- k in data_columns + target_column }
642
-
643
- X , y = _convert_arff_data (arff ['data' ], col_slice_x , col_slice_y , shape )
644
-
645
- is_classification = {col_name in nominal_attributes
646
- for col_name in target_column }
647
- if not is_classification :
648
- # No target
649
- pass
650
- elif all (is_classification ):
651
- y = np .hstack ([np .take (np .asarray (nominal_attributes .pop (col_name ),
652
- dtype = 'O' ),
653
- y [:, i :i + 1 ].astype (int , copy = False ))
654
- for i , col_name in enumerate (target_column )])
655
- elif any (is_classification ):
656
- raise ValueError ('Mix of nominal and non-nominal targets is not '
657
- 'currently supported' )
722
+ data_home , encode_nominal = not as_frame )
658
723
659
724
description = "{}\n \n Downloaded from openml.org." .format (
660
725
data_description .pop ('description' ))
661
726
662
- # reshape y back to 1-D array, if there is only 1 target column; back
663
- # to None if there are not target columns
664
- if y .shape [1 ] == 1 :
665
- y = y .reshape ((- 1 ,))
666
- elif y .shape [1 ] == 0 :
667
- y = None
727
+ nominal_attributes = None
728
+ frame = None
729
+ if as_frame :
730
+ columns = data_columns + target_columns
731
+ frame = _convert_arff_data_dataframe (arff , columns , features_dict )
732
+ X = frame [data_columns ]
733
+ if len (target_columns ) >= 2 :
734
+ y = frame [target_columns ]
735
+ elif len (target_columns ) == 1 :
736
+ y = frame [target_columns [0 ]]
737
+ else :
738
+ y = None
739
+ else :
740
+ # nominal attributes is a dict mapping from the attribute name to the
741
+ # possible values. Includes also the target column (which will be
742
+ # popped off below, before it will be packed in the Bunch object)
743
+ nominal_attributes = {k : v for k , v in arff ['attributes' ]
744
+ if isinstance (v , list ) and
745
+ k in data_columns + target_columns }
746
+
747
+ X , y = _convert_arff_data (arff ['data' ], col_slice_x ,
748
+ col_slice_y , shape )
749
+
750
+ is_classification = {col_name in nominal_attributes
751
+ for col_name in target_columns }
752
+ if not is_classification :
753
+ # No target
754
+ pass
755
+ elif all (is_classification ):
756
+ y = np .hstack ([
757
+ np .take (
758
+ np .asarray (nominal_attributes .pop (col_name ), dtype = 'O' ),
759
+ y [:, i :i + 1 ].astype (int , copy = False ))
760
+ for i , col_name in enumerate (target_columns )
761
+ ])
762
+ elif any (is_classification ):
763
+ raise ValueError ('Mix of nominal and non-nominal targets is not '
764
+ 'currently supported' )
765
+
766
+ # reshape y back to 1-D array, if there is only 1 target column; back
767
+ # to None if there are not target columns
768
+ if y .shape [1 ] == 1 :
769
+ y = y .reshape ((- 1 ,))
770
+ elif y .shape [1 ] == 0 :
771
+ y = None
668
772
669
773
if return_X_y :
670
774
return X , y
671
775
672
776
bunch = Bunch (
673
- data = X , target = y , feature_names = data_columns ,
777
+ data = X , target = y , frame = frame , feature_names = data_columns ,
674
778
DESCR = description , details = data_description ,
675
779
categories = nominal_attributes ,
676
780
url = "https://www.openml.org/d/{}" .format (data_id ))
0 commit comments