From f456216b089cd77d37923b618083114ca192a9bd Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Tue, 16 May 2023 16:29:36 -0400 Subject: [PATCH 1/2] TST Fix openml parser implementation for pandas-dev --- sklearn/datasets/_arff_parser.py | 5 +++++ sklearn/datasets/tests/test_openml.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/sklearn/datasets/_arff_parser.py b/sklearn/datasets/_arff_parser.py index d085e9d85f2a9..26a394b95f790 100644 --- a/sklearn/datasets/_arff_parser.py +++ b/sklearn/datasets/_arff_parser.py @@ -199,6 +199,11 @@ def _io_to_generator(gzip_file): dfs.append( pd.DataFrame(data, columns=columns_names, copy=False)[columns_to_keep] ) + # dfs[0] contains only one row, which may not have enough data to infer the + # column's dtype. Here we use `dfs[1]` to configure the dtype in dfs[0] + if len(dfs) >= 2: + dfs[0] = dfs[0].astype(dfs[1].dtypes) + frame = pd.concat(dfs, ignore_index=True) del dfs, first_df diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py index 4e66f9b434dc5..fef03a5fd4d40 100644 --- a/sklearn/datasets/tests/test_openml.py +++ b/sklearn/datasets/tests/test_openml.py @@ -925,7 +925,7 @@ def datasets_missing_values(): # with casting it will be transformed to either float or Int64 (40966, "pandas", 1, 77, 0), # titanic - (40945, "liac-arff", 3, 5, 0), + (40945, "liac-arff", 3, 6, 0), (40945, "pandas", 3, 3, 3), ], ) From a54cab08b68a369c6199e5de291ae28afef3927e Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Thu, 18 May 2023 13:38:26 -0400 Subject: [PATCH 2/2] DOC Adds whats new --- doc/whats_new/v1.3.rst | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst index 9cab0db995c5d..80f05f4ad071c 100644 --- a/doc/whats_new/v1.3.rst +++ b/doc/whats_new/v1.3.rst @@ -213,6 +213,9 @@ Changelog is deprecated and will be removed in v1.5. :pr:`25784` by :user:`Jérémie du Boisberranger`. 
+- |Fix| :func:`datasets.fetch_openml` returns improved data types when + `as_frame=True` and `parser="liac-arff"`. :pr:`26386` by `Thomas Fan`_. + :mod:`sklearn.decomposition` ............................ @@ -409,7 +412,7 @@ Changelog - |API| The `eps` parameter of the :func:`log_loss` has been deprecated and will be removed in 1.5. :pr:`25299` by :user:`Omar Salman <OmarManzoor>`. - + - |Feature| :func:`metrics.average_precision_score` now supports the multiclass case. :pr:`17388` by :user:`Geoffrey Bolmier <gbolmier>` and