diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
index 135522ebd5..53490d7771 100644
--- a/bigframes/dataframe.py
+++ b/bigframes/dataframe.py
@@ -1669,17 +1669,27 @@ def to_pandas(
 
         Args:
             max_download_size (int, default None):
-                Download size threshold in MB. If max_download_size is exceeded when downloading data
-                (e.g., to_pandas()), the data will be downsampled if
-                bigframes.options.sampling.enable_downsampling is True, otherwise, an error will be
-                raised. If set to a value other than None, this will supersede the global config.
+                .. deprecated:: 2.0.0
+                    The ``max_download_size`` parameter is deprecated. Please use the
+                    ``to_pandas_batches()`` method instead.
+
+                Download size threshold in MB. If ``max_download_size`` is exceeded when downloading data,
+                the data will be downsampled if ``bigframes.options.sampling.enable_downsampling`` is
+                ``True``; otherwise, an error will be raised. If set to a value other than ``None``,
+                this will supersede the global config.
             sampling_method (str, default None):
+                .. deprecated:: 2.0.0
+                    The ``sampling_method`` parameter is deprecated. Please use the ``sample()`` method instead.
+
                 Downsampling algorithms to be chosen from, the choices are: "head": This algorithm
                 returns a portion of the data from the beginning. It is fast and requires minimal
                 computations to perform the downsampling; "uniform": This algorithm returns uniform
                 random samples of the data. If set to a value other than None, this will supersede
                 the global config.
             random_state (int, default None):
+                .. deprecated:: 2.0.0
+                    The ``random_state`` parameter is deprecated. Please use the ``sample()`` method instead.
+
                 The seed for the uniform downsampling algorithm. If provided, the uniform method may
                 take longer to execute and require more computation. If set to a value other than
                 None, this will supersede the global config.
@@ -1699,6 +1709,20 @@ def to_pandas(
             downsampled rows and all columns of this DataFrame. If dry_run is set, a pandas
             Series containing dry run statistics will be returned.
         """
+        if max_download_size is not None:
+            msg = bfe.format_message(
+                "DEPRECATED: The `max_download_size` parameter for `DataFrame.to_pandas()` "
+                "is deprecated and will be removed soon. Please use `DataFrame.to_pandas_batches()`."
+            )
+            warnings.warn(msg, category=FutureWarning)
+        if sampling_method is not None or random_state is not None:
+            msg = bfe.format_message(
+                "DEPRECATED: The `sampling_method` and `random_state` parameters for "
+                "`DataFrame.to_pandas()` are deprecated and will be removed soon. "
+                "Please use `DataFrame.sample().to_pandas()` instead for sampling."
+            )
+            warnings.warn(msg, category=FutureWarning, stacklevel=2)
+
         if dry_run:
             dry_run_stats, dry_run_job = self._block._compute_dry_run(
                 max_download_size=max_download_size,
diff --git a/bigframes/series.py b/bigframes/series.py
index 882c601b7c..559f7ef48e 100644
--- a/bigframes/series.py
+++ b/bigframes/series.py
@@ -35,6 +35,7 @@
     Tuple,
     Union,
 )
+import warnings
 
 import bigframes_vendored.constants as constants
 import bigframes_vendored.pandas.core.series as vendored_pandas_series
@@ -61,6 +62,7 @@
 import bigframes.core.window_spec as windows
 import bigframes.dataframe
 import bigframes.dtypes
+import bigframes.exceptions as bfe
 import bigframes.formatting_helpers as formatter
 import bigframes.operations as ops
 import bigframes.operations.aggregations as agg_ops
@@ -432,17 +434,27 @@ def to_pandas(
 
         Args:
            max_download_size (int, default None):
-                Download size threshold in MB. If max_download_size is exceeded when downloading data
-                (e.g., to_pandas()), the data will be downsampled if
-                bigframes.options.sampling.enable_downsampling is True, otherwise, an error will be
-                raised. If set to a value other than None, this will supersede the global config.
+                .. deprecated:: 2.0.0
+                    The ``max_download_size`` parameter is deprecated. Please use the
+                    ``to_pandas_batches()`` method instead.
+
+                Download size threshold in MB. If ``max_download_size`` is exceeded when downloading data,
+                the data will be downsampled if ``bigframes.options.sampling.enable_downsampling`` is
+                ``True``; otherwise, an error will be raised. If set to a value other than ``None``,
+                this will supersede the global config.
            sampling_method (str, default None):
+                .. deprecated:: 2.0.0
+                    The ``sampling_method`` parameter is deprecated. Please use the ``sample()`` method instead.
+
                 Downsampling algorithms to be chosen from, the choices are: "head": This algorithm
                 returns a portion of the data from the beginning. It is fast and requires minimal
                 computations to perform the downsampling; "uniform": This algorithm returns uniform
                 random samples of the data. If set to a value other than None, this will supersede
                 the global config.
            random_state (int, default None):
+                .. deprecated:: 2.0.0
+                    The ``random_state`` parameter is deprecated. Please use the ``sample()`` method instead.
+
                 The seed for the uniform downsampling algorithm. If provided, the uniform method may
                 take longer to execute and require more computation. If set to a value other than
                 None, this will supersede the global config.
@@ -461,6 +473,19 @@ def to_pandas(
             is not exceeded; otherwise, a pandas Series with downsampled rows of the DataFrame.
             If dry_run is set to True, a pandas Series containing dry run statistics will be returned.
         """
+        if max_download_size is not None:
+            msg = bfe.format_message(
+                "DEPRECATED: The `max_download_size` parameter for `Series.to_pandas()` "
+                "is deprecated and will be removed soon. Please use `Series.to_pandas_batches()`."
+            )
+            warnings.warn(msg, category=FutureWarning)
+        if sampling_method is not None or random_state is not None:
+            msg = bfe.format_message(
+                "DEPRECATED: The `sampling_method` and `random_state` parameters for "
+                "`Series.to_pandas()` are deprecated and will be removed soon. "
+                "Please use `Series.sample().to_pandas()` instead for sampling."
+            )
+            warnings.warn(msg, category=FutureWarning)
 
         if dry_run:
             dry_run_stats, dry_run_job = self._block._compute_dry_run(
diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py
index 914c953f99..e8316f253b 100644
--- a/tests/system/small/test_dataframe.py
+++ b/tests/system/small/test_dataframe.py
@@ -5203,9 +5203,7 @@ def test_query_complexity_repeated_subtrees(
     # See: https://github.com/python/cpython/issues/112282
     reason="setrecursionlimit has no effect on the Python C stack since Python 3.12.",
 )
-def test_query_complexity_repeated_analytic(
-    scalars_df_index, scalars_pandas_df_index, with_multiquery_execution
-):
+def test_query_complexity_repeated_analytic(scalars_df_index, scalars_pandas_df_index):
     bf_df = scalars_df_index[["int64_col", "int64_too"]]
     pd_df = scalars_pandas_df_index[["int64_col", "int64_too"]]
     # Uses LAG analytic operator, each in a new SELECT
@@ -5217,22 +5215,6 @@ def test_query_complexity_repeated_analytic(
     assert_pandas_df_equal(bf_result, pd_result)
 
 
-def test_to_pandas_downsampling_option_override(session):
-    df = session.read_gbq("bigframes-dev.bigframes_tests_sys.batting")
-    download_size = 1
-
-    # limits only apply for allow_large_result=True
-    df = df.to_pandas(
-        max_download_size=download_size,
-        sampling_method="head",
-        allow_large_results=True,
-    )
-
-    total_memory_bytes = df.memory_usage(deep=True).sum()
-    total_memory_mb = total_memory_bytes / (1024 * 1024)
-    assert total_memory_mb == pytest.approx(download_size, rel=0.5)
-
-
 def test_to_gbq_and_create_dataset(session, scalars_df_index, dataset_id_not_created):
     dataset_id = dataset_id_not_created
     destination_table = f"{dataset_id}.scalars_df"
diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py
index 9886102e2e..a69c26bc54 100644
--- a/tests/system/small/test_dataframe_io.py
+++ b/tests/system/small/test_dataframe_io.py
@@ -266,6 +266,62 @@ def test_to_pandas_override_global_option(scalars_df_index):
     assert scalars_df_index._query_job.destination.table_id == table_id
 
 
+def test_to_pandas_downsampling_option_override(session):
+    df = session.read_gbq("bigframes-dev.bigframes_tests_sys.batting")
+    download_size = 1
+
+    with pytest.warns(
+        UserWarning, match="The data size .* exceeds the maximum download limit"
+    ):
+        # limits only apply for allow_large_result=True
+        df = df.to_pandas(
+            max_download_size=download_size,
+            sampling_method="head",
+            allow_large_results=True,
+        )
+
+    total_memory_bytes = df.memory_usage(deep=True).sum()
+    total_memory_mb = total_memory_bytes / (1024 * 1024)
+    assert total_memory_mb == pytest.approx(download_size, rel=0.5)
+
+
+@pytest.mark.parametrize(
+    ("kwargs", "message"),
+    [
+        pytest.param(
+            {"sampling_method": "head"},
+            r"DEPRECATED[\S\s]*sampling_method[\S\s]*DataFrame.sample",
+            id="sampling_method",
+        ),
+        pytest.param(
+            {"random_state": 10},
+            r"DEPRECATED[\S\s]*random_state[\S\s]*DataFrame.sample",
+            id="random_state",
+        ),
+        pytest.param(
+            {"max_download_size": 10},
+            r"DEPRECATED[\S\s]*max_download_size[\S\s]*DataFrame.to_pandas_batches",
+            id="max_download_size",
+        ),
+    ],
+)
+def test_to_pandas_warns_deprecated_parameters(scalars_df_index, kwargs, message):
+    with pytest.warns(FutureWarning, match=message):
+        scalars_df_index.to_pandas(
+            # limits only apply for allow_large_result=True
+            allow_large_results=True,
+            **kwargs,
+        )
+
+
+def test_to_pandas_dry_run(session, scalars_pandas_df_multi_index):
+    bf_df = session.read_pandas(scalars_pandas_df_multi_index)
+
+    result = bf_df.to_pandas(dry_run=True)
+
+    assert len(result) == 14
+
+
 def test_to_arrow_override_global_option(scalars_df_index):
     # Direct call to_arrow uses global default setting (allow_large_results=True),
     with bigframes.option_context("bigquery.allow_large_results", True):
@@ -813,11 +869,3 @@ def test_to_sql_query_named_index_excluded(
     utils.assert_pandas_df_equal(
         roundtrip.to_pandas(), pd_df, check_index_type=False, ignore_order=True
     )
-
-
-def test_to_pandas_dry_run(session, scalars_pandas_df_multi_index):
-    bf_df = session.read_pandas(scalars_pandas_df_multi_index)
-
-    result = bf_df.to_pandas(dry_run=True)
-
-    assert len(result) == 14
diff --git a/tests/system/small/test_index.py b/tests/system/small/test_index.py
index 535e4bc9ae..9f45c8465b 100644
--- a/tests/system/small/test_index.py
+++ b/tests/system/small/test_index.py
@@ -426,11 +426,3 @@ def test_multiindex_repr_includes_all_names(session):
     )
     index = session.read_pandas(df).set_index(["A", "B"]).index
     assert "names=['A', 'B']" in repr(index)
-
-
-def test_to_pandas_dry_run(scalars_df_index):
-    index = scalars_df_index.index
-
-    result = index.to_pandas(dry_run=True)
-
-    assert len(result) == 14
diff --git a/tests/system/small/test_index_io.py b/tests/system/small/test_index_io.py
index 3bf9794f5a..fcb3fa3920 100644
--- a/tests/system/small/test_index_io.py
+++ b/tests/system/small/test_index_io.py
@@ -30,6 +30,14 @@ def test_to_pandas_override_global_option(scalars_df_index):
         assert bf_index._query_job.destination.table_id == table_id
 
 
+def test_to_pandas_dry_run(scalars_df_index):
+    index = scalars_df_index.index
+
+    result = index.to_pandas(dry_run=True)
+
+    assert len(result) == 14
+
+
 def test_to_numpy_override_global_option(scalars_df_index):
     with bigframes.option_context("bigquery.allow_large_results", True):
diff --git a/tests/system/small/test_series_io.py b/tests/system/small/test_series_io.py
index 8a699aed73..235ae65750 100644
--- a/tests/system/small/test_series_io.py
+++ b/tests/system/small/test_series_io.py
@@ -15,6 +15,7 @@
 import pytest
 
 import bigframes
+import bigframes.series
 
 
 def test_to_pandas_override_global_option(scalars_df_index):
@@ -37,6 +38,36 @@ def test_to_pandas_override_global_option(scalars_df_index):
     assert session._metrics.execution_count - execution_count == 1
 
 
+@pytest.mark.parametrize(
+    ("kwargs", "message"),
+    [
+        pytest.param(
+            {"sampling_method": "head"},
+            r"DEPRECATED[\S\s]*sampling_method[\S\s]*Series.sample",
+            id="sampling_method",
+        ),
+        pytest.param(
+            {"random_state": 10},
+            r"DEPRECATED[\S\s]*random_state[\S\s]*Series.sample",
+            id="random_state",
+        ),
+        pytest.param(
+            {"max_download_size": 10},
+            r"DEPRECATED[\S\s]*max_download_size[\S\s]*Series.to_pandas_batches",
+            id="max_download_size",
+        ),
+    ],
+)
+def test_to_pandas_warns_deprecated_parameters(scalars_df_index, kwargs, message):
+    s: bigframes.series.Series = scalars_df_index["int64_col"]
+    with pytest.warns(FutureWarning, match=message):
+        s.to_pandas(
+            # limits only apply for allow_large_result=True
+            allow_large_results=True,
+            **kwargs,
+        )
+
+
 @pytest.mark.parametrize(
     ("page_size", "max_results", "allow_large_results"),
     [
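
For reviewers, a minimal migration sketch of what this change steers callers toward: `sample()` replaces the `sampling_method`/`random_state` arguments, and `to_pandas_batches()` replaces `max_download_size`. The table name, sampling fraction, and page size below are illustrative only, not part of this diff.

```python
import bigframes.pandas as bpd

df = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins")

# Before (now emits FutureWarning):
#     pdf = df.to_pandas(sampling_method="uniform", random_state=10)
# After: sample explicitly, then download the reduced result.
pdf = df.sample(frac=0.1, random_state=10).to_pandas()

# Before (now emits FutureWarning):
#     pdf = df.to_pandas(max_download_size=250)
# After: stream the rows in bounded pages instead of downsampling.
for page in df.to_pandas_batches(page_size=10_000):
    print(page.shape)  # each page is an ordinary pandas DataFrame
```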