ENH allows to overwrite read_csv parameter in fetch_openml (scikit-learn#26433)

glemaitre · thomasjpfan · adrinjalali · manudarmi · commit 0ed43747b6c8 · 2023-06-12T13:42:28.000+02:00
Co-authored-by: Thomas J. Fan &lt;thomasjpfan@gmail.com&gt;
Co-authored-by: Adrin Jalali &lt;adrin.jalali@gmail.com&gt;
diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst
@@ -271,6 +271,11 @@ Changelog
 - |Fix| :func:`datasets.fetch_openml` returns improved data types when
   `as_frame=True` and `parser="liac-arff"`. :pr:`26386` by `Thomas Fan`_.
 
+- |Enhancement| Allows to overwrite the parameters used to open the ARFF file using
+  the parameter `read_csv_kwargs` in :func:`datasets.fetch_openml` when using the
+  pandas parser.
+  :pr:`26433` by :user:`Guillaume Lemaitre <glemaitre>`.
+
 :mod:`sklearn.decomposition`
 ............................
 
diff --git a/sklearn/datasets/_openml.py b/sklearn/datasets/_openml.py
@@ -428,6 +428,7 @@ def _load_arff_response(
     md5_checksum: str,
     n_retries: int = 3,
     delay: float = 1.0,
+    read_csv_kwargs: Optional[Dict] = None,
 ):
     """Load the ARFF data associated with the OpenML URL.
 
@@ -470,6 +471,18 @@ def _load_arff_response(
     md5_checksum : str
         The MD5 checksum provided by OpenML to check the data integrity.
 
+    n_retries : int, default=3
+        The number of times to retry downloading the data if it fails.
+
+    delay : float, default=1.0
+        The delay between two consecutive downloads in seconds.
+
+    read_csv_kwargs : dict, default=None
+        Keyword arguments to pass to `pandas.read_csv` when using the pandas parser.
+        It allows to overwrite the default options.
+
+        .. versionadded:: 1.3
+
     Returns
     -------
     X : {ndarray, sparse matrix, dataframe}
@@ -506,13 +519,14 @@ def _open_url_and_load_gzip_file(url, data_home, n_retries, delay, arff_params):
         with closing(gzip_file):
             return load_arff_from_gzip_file(gzip_file, **arff_params)
 
-    arff_params = dict(
+    arff_params: Dict = dict(
         parser=parser,
         output_type=output_type,
         openml_columns_info=openml_columns_info,
         feature_names_to_select=feature_names_to_select,
         target_names_to_select=target_names_to_select,
         shape=shape,
+        read_csv_kwargs=read_csv_kwargs or {},
     )
     try:
         X, y, frame, categories = _open_url_and_load_gzip_file(
@@ -530,7 +544,7 @@ def _open_url_and_load_gzip_file(url, data_home, n_retries, delay, arff_params):
         # A parsing error could come from providing the wrong quotechar
         # to pandas. By default, we use a double quote. Thus, we retry
         # with a single quote before to raise the error.
-        arff_params["read_csv_kwargs"] = {"quotechar": "'"}
+        arff_params["read_csv_kwargs"].update(quotechar="'")
         X, y, frame, categories = _open_url_and_load_gzip_file(
             url, data_home, n_retries, delay, arff_params
         )
@@ -552,6 +566,7 @@ def _download_data_to_bunch(
     n_retries: int = 3,
     delay: float = 1.0,
     parser: str,
+    read_csv_kwargs: Optional[Dict] = None,
 ):
     """Download ARFF data, load it to a specific container and create to Bunch.
 
@@ -598,6 +613,12 @@ def _download_data_to_bunch(
     parser : {"liac-arff", "pandas"}
         The parser used to parse the ARFF file.
 
+    read_csv_kwargs : dict, default=None
+        Keyword arguments to pass to `pandas.read_csv` when using the pandas parser.
+        It allows to overwrite the default options.
+
+        .. versionadded:: 1.3
+
     Returns
     -------
     data : :class:`~sklearn.utils.Bunch`
@@ -657,6 +678,7 @@ def _download_data_to_bunch(
         md5_checksum=md5_checksum,
         n_retries=n_retries,
         delay=delay,
+        read_csv_kwargs=read_csv_kwargs,
     )
 
     return Bunch(
@@ -725,6 +747,7 @@ def fetch_openml(
     n_retries: int = 3,
     delay: float = 1.0,
     parser: Optional[str] = "warn",
+    read_csv_kwargs: Optional[Dict] = None,
 ):
     """Fetch dataset from openml by name or dataset id.
 
@@ -829,6 +852,13 @@ def fetch_openml(
            warning. Therefore, an `ImportError` will be raised from 1.4 if
            the dataset is dense and pandas is not installed.
 
+    read_csv_kwargs : dict, default=None
+        Keyword arguments passed to :func:`pandas.read_csv` when loading the data
+        from a ARFF file and using the pandas parser. It can allows to
+        overwrite some default parameters.
+
+        .. versionadded:: 1.3
+
     Returns
     -------
     data : :class:`~sklearn.utils.Bunch`
@@ -1096,6 +1126,7 @@ def fetch_openml(
         n_retries=n_retries,
         delay=delay,
         parser=parser_,
+        read_csv_kwargs=read_csv_kwargs,
     )
 
     if return_X_y:
diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py
@@ -1354,6 +1354,34 @@ def test_dataset_with_openml_warning(monkeypatch, gzip_response):
         fetch_openml(data_id=data_id, cache=False, as_frame=False, parser="liac-arff")
 
 
+def test_fetch_openml_overwrite_default_params_read_csv(monkeypatch):
+    """Check that we can overwrite the default parameters of `read_csv`."""
+    pytest.importorskip("pandas")
+    data_id = 1590
+    _monkey_patch_webbased_functions(monkeypatch, data_id=data_id, gzip_response=False)
+
+    common_params = {
+        "data_id": data_id,
+        "as_frame": True,
+        "cache": False,
+        "parser": "pandas",
+    }
+
+    # By default, the initial spaces are skipped. We checked that setting the parameter
+    # `skipinitialspace` to False will have an effect.
+    adult_without_spaces = fetch_openml(**common_params)
+    adult_with_spaces = fetch_openml(
+        **common_params, read_csv_kwargs={"skipinitialspace": False}
+    )
+    assert all(
+        cat.startswith(" ") for cat in adult_with_spaces.frame["class"].cat.categories
+    )
+    assert not any(
+        cat.startswith(" ")
+        for cat in adult_without_spaces.frame["class"].cat.categories
+    )
+
+
 ###############################################################################
 # Test cache, retry mechanisms, checksum, etc.