@@ -428,6 +428,7 @@ def _load_arff_response(
428
428
md5_checksum : str ,
429
429
n_retries : int = 3 ,
430
430
delay : float = 1.0 ,
431
+ read_csv_kwargs : Optional [Dict ] = None ,
431
432
):
432
433
"""Load the ARFF data associated with the OpenML URL.
433
434
@@ -470,6 +471,18 @@ def _load_arff_response(
470
471
md5_checksum : str
471
472
The MD5 checksum provided by OpenML to check the data integrity.
472
473
474
+ n_retries : int, default=3
475
+ The number of times to retry downloading the data if it fails.
476
+
477
+ delay : float, default=1.0
478
+ The delay between two consecutive downloads in seconds.
479
+
480
+ read_csv_kwargs : dict, default=None
481
+ Keyword arguments to pass to `pandas.read_csv` when using the pandas parser.
482
+ It allows overwriting the default options.
483
+
484
+ .. versionadded:: 1.3
485
+
473
486
Returns
474
487
-------
475
488
X : {ndarray, sparse matrix, dataframe}
@@ -506,13 +519,14 @@ def _open_url_and_load_gzip_file(url, data_home, n_retries, delay, arff_params):
506
519
with closing (gzip_file ):
507
520
return load_arff_from_gzip_file (gzip_file , ** arff_params )
508
521
509
- arff_params = dict (
522
+ arff_params : Dict = dict (
510
523
parser = parser ,
511
524
output_type = output_type ,
512
525
openml_columns_info = openml_columns_info ,
513
526
feature_names_to_select = feature_names_to_select ,
514
527
target_names_to_select = target_names_to_select ,
515
528
shape = shape ,
529
+ read_csv_kwargs = read_csv_kwargs or {},
516
530 )
517
531
try :
518
532
X , y , frame , categories = _open_url_and_load_gzip_file (
@@ -530,7 +544,7 @@ def _open_url_and_load_gzip_file(url, data_home, n_retries, delay, arff_params):
530
544
# A parsing error could come from providing the wrong quotechar
531
545
# to pandas. By default, we use a double quote. Thus, we retry
532
546
# with a single quote before raising the error.
533
- arff_params ["read_csv_kwargs" ] = { "quotechar" : "'" }
547
+ arff_params ["read_csv_kwargs" ]. update ( quotechar = "'" )
534
548
X , y , frame , categories = _open_url_and_load_gzip_file (
535
549
url , data_home , n_retries , delay , arff_params
536
550
)
@@ -552,6 +566,7 @@ def _download_data_to_bunch(
552
566
n_retries : int = 3 ,
553
567
delay : float = 1.0 ,
554
568
parser : str ,
569
+ read_csv_kwargs : Optional [Dict ] = None ,
555
570
):
556
571
"""Download ARFF data, load it to a specific container and create a Bunch.
557
572
@@ -598,6 +613,12 @@ def _download_data_to_bunch(
598
613
parser : {"liac-arff", "pandas"}
599
614
The parser used to parse the ARFF file.
600
615
616
+ read_csv_kwargs : dict, default=None
617
+ Keyword arguments to pass to `pandas.read_csv` when using the pandas parser.
618
+ It allows overwriting the default options.
619
+
620
+ .. versionadded:: 1.3
621
+
601
622
Returns
602
623
-------
603
624
data : :class:`~sklearn.utils.Bunch`
@@ -657,6 +678,7 @@ def _download_data_to_bunch(
657
678
md5_checksum = md5_checksum ,
658
679
n_retries = n_retries ,
659
680
delay = delay ,
681
+ read_csv_kwargs = read_csv_kwargs ,
660
682
)
661
683
662
684
return Bunch (
@@ -725,6 +747,7 @@ def fetch_openml(
725
747
n_retries : int = 3 ,
726
748
delay : float = 1.0 ,
727
749
parser : Optional [str ] = "warn" ,
750
+ read_csv_kwargs : Optional [Dict ] = None ,
728
751
):
729
752
"""Fetch dataset from openml by name or dataset id.
730
753
@@ -829,6 +852,13 @@ def fetch_openml(
829
852
warning. Therefore, an `ImportError` will be raised from 1.4 if
830
853
the dataset is dense and pandas is not installed.
831
854
855
+ read_csv_kwargs : dict, default=None
856
+ Keyword arguments passed to :func:`pandas.read_csv` when loading the data
857
+ from an ARFF file and using the pandas parser. It allows
858
+ overwriting some default parameters.
859
+
860
+ .. versionadded:: 1.3
861
+
832
862
Returns
833
863
-------
834
864
data : :class:`~sklearn.utils.Bunch`
@@ -1096,6 +1126,7 @@ def fetch_openml(
1096
1126
n_retries = n_retries ,
1097
1127
delay = delay ,
1098
1128
parser = parser_ ,
1129
+ read_csv_kwargs = read_csv_kwargs ,
1099
1130
)
1100
1131
1101
1132
if return_X_y :
0 commit comments