scikit-learn · lesteve · May 23, 2023 · May 16, 2023 · May 18, 2023 · ogrisel
diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst
@@ -213,6 +213,9 @@ Changelog
   is deprecated and will be removed in v1.5.
   :pr:`25784` by :user:`Jérémie du Boisberranger`.
 
+- |Fix| :func:`datasets.fetch_openml` returns improved data types when
+  `as_frame=True` and `parser="liac-arff"`. :pr:`26386` by `Thomas Fan`_.
+
 :mod:`sklearn.decomposition`
 ............................
 
@@ -409,7 +412,7 @@ Changelog
 
 - |API| The `eps` parameter of the :func:`log_loss` has been deprecated and will be
   removed in 1.5. :pr:`25299` by :user:`Omar Salman <OmarManzoor>`.
-  
+
 - |Feature| :func:`metrics.average_precision_score` now supports the
   multiclass case.
   :pr:`17388` by :user:`Geoffrey Bolmier <gbolmier>` and

diff --git a/sklearn/datasets/_arff_parser.py b/sklearn/datasets/_arff_parser.py
@@ -199,6 +199,11 @@ def _io_to_generator(gzip_file):
             dfs.append(
                 pd.DataFrame(data, columns=columns_names, copy=False)[columns_to_keep]
             )
+        # dfs[0] contains only one row, which may not have enough data to infer the
+        # column's dtype. Here we use `dfs[1]` to configure the dtype in dfs[0]
+        if len(dfs) >= 2:
+            dfs[0] = dfs[0].astype(dfs[1].dtypes)
+
         frame = pd.concat(dfs, ignore_index=True)
         del dfs, first_df
 

diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py
@@ -925,7 +925,7 @@ def datasets_missing_values():
         # with casting it will be transformed to either float or Int64
         (40966, "pandas", 1, 77, 0),
         # titanic
-        (40945, "liac-arff", 3, 5, 0),
+        (40945, "liac-arff", 3, 6, 0),
         (40945, "pandas", 3, 3, 3),
     ],
 )