EHN Add pandas dataframe support to fetch_openml (#13902) · scikit-learn/scikit-learn@cf3e303 · GitHub

Commit cf3e303

thomasjpfan authored and glemaitre committed
EHN Add pandas dataframe support to fetch_openml (#13902)
1 parent da66111 commit cf3e303

File tree

9 files changed (+635, -65 lines)

.circleci/config.yml

Lines changed: 0 additions & 1 deletion
@@ -12,7 +12,6 @@ jobs:
       - PYTHON_VERSION: 3.5
       - NUMPY_VERSION: 1.11.0
       - SCIPY_VERSION: 0.17.0
-      - PANDAS_VERSION: 0.18.0
       - MATPLOTLIB_VERSION: 1.5.1
       - SCIKIT_IMAGE_VERSION: 0.12.3
     steps:

azure-pipelines.yml

Lines changed: 1 addition & 0 deletions
@@ -20,6 +20,7 @@ jobs:
       INSTALL_MKL: 'false'
       NUMPY_VERSION: '1.11.0'
       SCIPY_VERSION: '0.17.0'
+      PANDAS_VERSION: '*'
       CYTHON_VERSION: '*'
       PILLOW_VERSION: '4.0.0'
       MATPLOTLIB_VERSION: '1.5.1'

doc/whats_new/v0.22.rst

Lines changed: 6 additions & 0 deletions
@@ -43,6 +43,12 @@ Changelog
     :pr:`123456` by :user:`Joe Bloggs <joeongithub>`.
     where 123456 is the *pull request* number, not the issue number.
 
+:mod:`sklearn.datasets`
+.......................
+
+- |Feature| :func:`datasets.fetch_openml` now supports heterogeneous data using pandas
+  by setting `as_frame=True`. :pr:`13902` by `Thomas Fan`_.
+
 :mod:`sklearn.decomposition`
 ............................
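
As a quick illustration of the changelog entry above, a minimal usage sketch (illustrative only; it assumes network access to openml.org, and data_id=40945 is the Titanic dataset used in the example change below):

    from sklearn.datasets import fetch_openml

    # Titanic mixes numeric and categorical columns.
    titanic = fetch_openml(data_id=40945, as_frame=True)

    type(titanic.data)    # pandas DataFrame with per-column dtypes
    type(titanic.target)  # pandas Series (default target is 'survived')
    titanic.frame.shape   # data and target combined in a single DataFrame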

examples/compose/plot_column_transformer_mixed_types.py

Lines changed: 8 additions & 7 deletions
@@ -24,10 +24,10 @@
 #
 # License: BSD 3 clause
 
-import pandas as pd
 import numpy as np
 
 from sklearn.compose import ColumnTransformer
+from sklearn.datasets import fetch_openml
 from sklearn.pipeline import Pipeline
 from sklearn.impute import SimpleImputer
 from sklearn.preprocessing import StandardScaler, OneHotEncoder
@@ -37,9 +37,13 @@
 np.random.seed(0)
 
 # Read data from Titanic dataset.
-titanic_url = ('https://raw.githubusercontent.com/amueller/'
-               'scipy-2017-sklearn/091d371/notebooks/datasets/titanic3.csv')
-data = pd.read_csv(titanic_url)
+titantic = fetch_openml(data_id=40945, as_frame=True)
+X = titantic.data
+y = titantic.target
+
+# Alternatively X and y can be obtained directly from the frame attribute:
+# X = titantic.frame.drop('survived', axis=1)
+# y = titantic.frame['survived']
 
 # We will train our classifier with the following features:
 # Numeric Features:
@@ -71,9 +75,6 @@
 clf = Pipeline(steps=[('preprocessor', preprocessor),
                       ('classifier', LogisticRegression())])
 
-X = data.drop('survived', axis=1)
-y = data['survived']
-
 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
 
 clf.fit(X_train, y_train)
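
A possible follow-up to the rewritten example (a sketch, assuming network access and that the OpenML Titanic dataset keeps its usual schema): inspecting the returned dtypes shows why the ColumnTransformer above can select columns by name and route them to the numeric or categorical sub-pipeline without manual conversion.

    from sklearn.datasets import fetch_openml

    titanic = fetch_openml(data_id=40945, as_frame=True)
    X, y = titanic.data, titanic.target

    # Numeric OpenML features arrive as float64 and nominal ones as 'category'.
    print(X.dtypes)
    print(y.value_counts())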

sklearn/datasets/openml.py

Lines changed: 158 additions & 54 deletions
@@ -8,6 +8,7 @@
 from functools import wraps
 import itertools
 from collections.abc import Generator
+from collections import OrderedDict
 
 from urllib.request import urlopen, Request
 
@@ -18,6 +19,9 @@
 from .base import get_data_home
 from urllib.error import HTTPError
 from ..utils import Bunch
+from ..utils import get_chunk_n_rows
+from ..utils import _chunk_generator
+from ..utils import check_pandas_support  # noqa
 
 __all__ = ['fetch_openml']
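
The three new ``..utils`` imports keep pandas an optional dependency and let the ARFF row generator be consumed in memory-bounded chunks. A rough sketch of what such helpers can look like (illustrative only; the actual implementations live in sklearn.utils and may differ in detail):

    from itertools import islice

    def check_pandas_support(caller_name):
        """Return the pandas module, or raise a helpful error if it is absent."""
        try:
            import pandas
            return pandas
        except ImportError as exc:
            raise ImportError("{} requires pandas.".format(caller_name)) from exc

    def _chunk_generator(gen, chunksize):
        """Yield lists of up to ``chunksize`` items drawn from the generator ``gen``."""
        while True:
            chunk = list(islice(gen, chunksize))
            if not chunk:
                return
            yield chunk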

@@ -263,6 +267,69 @@ def _convert_arff_data(arff_data, col_slice_x, col_slice_y, shape=None):
     raise ValueError('Unexpected Data Type obtained from arff.')
 
 
+def _feature_to_dtype(feature):
+    """Map feature to dtype for pandas DataFrame
+    """
+    if feature['data_type'] == 'string':
+        return object
+    elif feature['data_type'] == 'nominal':
+        return 'category'
+    # only numeric, integer, real are left
+    elif (feature['number_of_missing_values'] != '0' or
+          feature['data_type'] in ['numeric', 'real']):
+        # cast to floats when there are any missing values
+        return np.float64
+    elif feature['data_type'] == 'integer':
+        return np.int64
+    raise ValueError('Unsupported feature: {}'.format(feature))
+
+
+def _convert_arff_data_dataframe(arrf, columns, features_dict):
+    """Convert the ARFF object into a pandas DataFrame.
+
+    Parameters
+    ----------
+    arrf : dict
+        As obtained from liac-arff object.
+
+    columns : list
+        Columns from dataframe to return.
+
+    features_dict : dict
+        Maps feature name to feature info from openml.
+
+    Returns
+    -------
+    dataframe : pandas DataFrame
+    """
+    pd = check_pandas_support('fetch_openml with as_frame=True')
+
+    attributes = OrderedDict(arrf['attributes'])
+    arrf_columns = list(attributes)
+
+    # calculate chunksize
+    first_row = next(arrf['data'])
+    first_df = pd.DataFrame([first_row], columns=arrf_columns)
+
+    row_bytes = first_df.memory_usage(deep=True).sum()
+    chunksize = get_chunk_n_rows(row_bytes)
+
+    # read arrf data with chunks
+    columns_to_keep = [col for col in arrf_columns if col in columns]
+    dfs = []
+    dfs.append(first_df[columns_to_keep])
+    for data in _chunk_generator(arrf['data'], chunksize):
+        dfs.append(pd.DataFrame(data, columns=arrf_columns)[columns_to_keep])
+    df = pd.concat(dfs)
+
+    for column in columns_to_keep:
+        dtype = _feature_to_dtype(features_dict[column])
+        if dtype == 'category':
+            dtype = pd.api.types.CategoricalDtype(attributes[column])
+        df[column] = df[column].astype(dtype, copy=False)
+    return df
+
+
 def _get_data_info_by_name(name, version, data_home):
     """
     Utilizes the openml dataset listing api to find a dataset by
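
A toy walk-through of what the two helpers above produce, using an in-memory stand-in for the liac-arff payload (illustrative only; the real code path streams rows in chunks sized by get_chunk_n_rows):

    from collections import OrderedDict
    import numpy as np
    import pandas as pd

    # ARFF declares each attribute's type; nominal attributes carry their levels.
    attributes = OrderedDict([('age', 'REAL'), ('embarked', ['C', 'Q', 'S'])])
    rows = [[22.0, 'S'], [38.0, 'C'], [np.nan, 'Q']]

    df = pd.DataFrame(rows, columns=list(attributes))
    # As in _feature_to_dtype: numeric/real -> float64, nominal -> category
    # built from the declared ARFF levels (kept even if a level never occurs).
    df['age'] = df['age'].astype(np.float64)
    df['embarked'] = df['embarked'].astype(
        pd.api.types.CategoricalDtype(attributes['embarked']))

    print(df.dtypes)                      # age: float64, embarked: category
    print(df['embarked'].cat.categories)  # Index(['C', 'Q', 'S'], dtype='object')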
@@ -436,7 +503,8 @@ def _valid_data_column_names(features_list, target_columns):
 
 
 def fetch_openml(name=None, version='active', data_id=None, data_home=None,
-                 target_column='default-target', cache=True, return_X_y=False):
+                 target_column='default-target', cache=True, return_X_y=False,
+                 as_frame=False):
     """Fetch dataset from openml by name or dataset id.
 
     Datasets are uniquely identified by either an integer ID or by a
@@ -489,26 +557,39 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None,
         If True, returns ``(data, target)`` instead of a Bunch object. See
         below for more information about the `data` and `target` objects.
 
+    as_frame : boolean, default=False
+        If True, the data is a pandas DataFrame including columns with
+        appropriate dtypes (numeric, string or categorical). The target is
+        a pandas DataFrame or Series depending on the number of target_columns.
+        The Bunch will contain a ``frame`` attribute with the target and the
+        data. If ``return_X_y`` is True, then ``(data, target)`` will be pandas
+        DataFrames or Series as describe above.
+
     Returns
     -------
 
     data : Bunch
         Dictionary-like object, with attributes:
 
-        data : np.array or scipy.sparse.csr_matrix of floats
+        data : np.array, scipy.sparse.csr_matrix of floats, or pandas DataFrame
             The feature matrix. Categorical features are encoded as ordinals.
-        target : np.array
+        target : np.array, pandas Series or DataFrame
             The regression target or classification labels, if applicable.
-            Dtype is float if numeric, and object if categorical.
+            Dtype is float if numeric, and object if categorical. If
+            ``as_frame`` is True, ``target`` is a pandas object.
         DESCR : str
             The full description of the dataset
         feature_names : list
             The names of the dataset columns
-        categories : dict
+        categories : dict or None
             Maps each categorical feature name to a list of values, such
-            that the value encoded as i is ith in the list.
+            that the value encoded as i is ith in the list. If ``as_frame``
+            is True, this is None.
         details : dict
             More metadata from OpenML
+        frame : pandas DataFrame
+            Only present when `as_frame=True`. DataFrame with ``data`` and
+            ``target``.
 
     (data, target) : tuple if ``return_X_y`` is True
 
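
A short sketch of the documented return values when as_frame=True (assumes network access; 'iris' is only an arbitrary example dataset):

    from sklearn.datasets import fetch_openml

    bunch = fetch_openml('iris', version=1, as_frame=True)
    bunch.data        # pandas DataFrame with the feature columns
    bunch.target      # pandas Series (single default target)
    bunch.frame       # DataFrame holding data and target together
    bunch.categories  # None: category information lives in the frame's dtypes

    # With return_X_y=True the same pandas objects are returned directly.
    X, y = fetch_openml('iris', version=1, as_frame=True, return_X_y=True)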
@@ -568,41 +649,52 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None,
         warn("OpenML raised a warning on the dataset. It might be "
              "unusable. Warning: {}".format(data_description['warning']))
 
+    return_sparse = False
+    if data_description['format'].lower() == 'sparse_arff':
+        return_sparse = True
+
+    if as_frame and return_sparse:
+        raise ValueError('Cannot return dataframe with sparse data')
+
     # download data features, meta-info about column types
     features_list = _get_data_features(data_id, data_home)
 
-    for feature in features_list:
-        if 'true' in (feature['is_ignore'], feature['is_row_identifier']):
-            continue
-        if feature['data_type'] == 'string':
-            raise ValueError('STRING attributes are not yet supported')
+    if not as_frame:
+        for feature in features_list:
+            if 'true' in (feature['is_ignore'], feature['is_row_identifier']):
+                continue
+            if feature['data_type'] == 'string':
+                raise ValueError('STRING attributes are not supported for '
+                                 'array representation. Try as_frame=True')
 
     if target_column == "default-target":
         # determines the default target based on the data feature results
         # (which is currently more reliable than the data description;
         # see issue: https://github.com/openml/OpenML/issues/768)
-        target_column = [feature['name'] for feature in features_list
-                         if feature['is_target'] == 'true']
+        target_columns = [feature['name'] for feature in features_list
+                          if feature['is_target'] == 'true']
     elif isinstance(target_column, str):
         # for code-simplicity, make target_column by default a list
-        target_column = [target_column]
+        target_columns = [target_column]
     elif target_column is None:
-        target_column = []
-    elif not isinstance(target_column, list):
+        target_columns = []
+    elif isinstance(target_column, list):
+        target_columns = target_column
+    else:
         raise TypeError("Did not recognize type of target_column"
                         "Should be str, list or None. Got: "
                         "{}".format(type(target_column)))
     data_columns = _valid_data_column_names(features_list,
-                                            target_column)
+                                            target_columns)
 
     # prepare which columns and data types should be returned for the X and y
     features_dict = {feature['name']: feature for feature in features_list}
 
     # XXX: col_slice_y should be all nominal or all numeric
-    _verify_target_data_type(features_dict, target_column)
+    _verify_target_data_type(features_dict, target_columns)
 
     col_slice_y = [int(features_dict[col_name]['index'])
-                   for col_name in target_column]
+                   for col_name in target_columns]
 
     col_slice_x = [int(features_dict[col_name]['index'])
                    for col_name in data_columns]
@@ -615,10 +707,6 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None,
                              'columns. '.format(feat['name'], nr_missing))
 
     # determine arff encoding to return
-    return_sparse = False
-    if data_description['format'].lower() == 'sparse_arff':
-        return_sparse = True
-
     if not return_sparse:
         data_qualities = _get_data_qualities(data_id, data_home)
         shape = _get_data_shape(data_qualities)
@@ -631,46 +719,62 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None,
 
     # obtain the data
     arff = _download_data_arff(data_description['file_id'], return_sparse,
-                               data_home)
-
-    # nominal attributes is a dict mapping from the attribute name to the
-    # possible values. Includes also the target column (which will be popped
-    # off below, before it will be packed in the Bunch object)
-    nominal_attributes = {k: v for k, v in arff['attributes']
-                          if isinstance(v, list) and
-                          k in data_columns + target_column}
-
-    X, y = _convert_arff_data(arff['data'], col_slice_x, col_slice_y, shape)
-
-    is_classification = {col_name in nominal_attributes
-                         for col_name in target_column}
-    if not is_classification:
-        # No target
-        pass
-    elif all(is_classification):
-        y = np.hstack([np.take(np.asarray(nominal_attributes.pop(col_name),
-                                          dtype='O'),
-                               y[:, i:i+1].astype(int, copy=False))
-                       for i, col_name in enumerate(target_column)])
-    elif any(is_classification):
-        raise ValueError('Mix of nominal and non-nominal targets is not '
-                         'currently supported')
+                               data_home, encode_nominal=not as_frame)
 
     description = "{}\n\nDownloaded from openml.org.".format(
         data_description.pop('description'))
 
-    # reshape y back to 1-D array, if there is only 1 target column; back
-    # to None if there are not target columns
-    if y.shape[1] == 1:
-        y = y.reshape((-1,))
-    elif y.shape[1] == 0:
-        y = None
+    nominal_attributes = None
+    frame = None
+    if as_frame:
+        columns = data_columns + target_columns
+        frame = _convert_arff_data_dataframe(arff, columns, features_dict)
+        X = frame[data_columns]
+        if len(target_columns) >= 2:
+            y = frame[target_columns]
+        elif len(target_columns) == 1:
+            y = frame[target_columns[0]]
+        else:
+            y = None
+    else:
+        # nominal attributes is a dict mapping from the attribute name to the
+        # possible values. Includes also the target column (which will be
+        # popped off below, before it will be packed in the Bunch object)
+        nominal_attributes = {k: v for k, v in arff['attributes']
+                              if isinstance(v, list) and
+                              k in data_columns + target_columns}
+
+        X, y = _convert_arff_data(arff['data'], col_slice_x,
+                                  col_slice_y, shape)
+
+        is_classification = {col_name in nominal_attributes
+                             for col_name in target_columns}
+        if not is_classification:
+            # No target
+            pass
+        elif all(is_classification):
+            y = np.hstack([
+                np.take(
+                    np.asarray(nominal_attributes.pop(col_name), dtype='O'),
+                    y[:, i:i + 1].astype(int, copy=False))
+                for i, col_name in enumerate(target_columns)
+            ])
+        elif any(is_classification):
+            raise ValueError('Mix of nominal and non-nominal targets is not '
+                             'currently supported')
+
+        # reshape y back to 1-D array, if there is only 1 target column; back
+        # to None if there are not target columns
+        if y.shape[1] == 1:
+            y = y.reshape((-1,))
+        elif y.shape[1] == 0:
+            y = None
 
     if return_X_y:
         return X, y
 
     bunch = Bunch(
-        data=X, target=y, feature_names=data_columns,
+        data=X, target=y, frame=frame, feature_names=data_columns,
         DESCR=description, details=data_description,
         categories=nominal_attributes,
         url="https://www.openml.org/d/{}".format(data_id))
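
To make the frame-splitting rule explicit, a toy restatement of the as_frame branch above (illustrative): y becomes a DataFrame when two or more target columns are requested, a Series for exactly one, and None when there is no target.

    import pandas as pd

    frame = pd.DataFrame({'f0': [1.0, 2.0], 'f1': [3.0, 4.0],
                          't0': ['a', 'b'], 't1': ['c', 'd']})
    data_columns, target_columns = ['f0', 'f1'], ['t0', 't1']

    X = frame[data_columns]
    if len(target_columns) >= 2:
        y = frame[target_columns]      # DataFrame (multi-target)
    elif len(target_columns) == 1:
        y = frame[target_columns[0]]   # Series
    else:
        y = None                       # no target requested

    print(type(X).__name__, type(y).__name__)  # DataFrame DataFrame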
2 binary files changed (not shown).

0 commit comments