From 17354666baf58c4dc97f0701a786d97a6386937f Mon Sep 17 00:00:00 2001
From: Chelsea Lin
Date: Thu, 3 Apr 2025 05:58:14 +0000
Subject: [PATCH 1/4] feat: warn deprecated parameters

---
 bigframes/dataframe.py                  | 24 ++++++++++
 bigframes/series.py                     | 25 ++++++++++
 tests/system/small/test_dataframe.py    | 20 +-------
 tests/system/small/test_dataframe_io.py | 64 +++++++++++++++++++++----
 tests/system/small/test_index.py        |  8 ----
 tests/system/small/test_index_io.py     |  8 ++++
 tests/system/small/test_series_io.py    | 33 +++++++++++++
 7 files changed, 147 insertions(+), 35 deletions(-)

diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
index 135522ebd5..72ec42c247 100644
--- a/bigframes/dataframe.py
+++ b/bigframes/dataframe.py
@@ -1669,17 +1669,27 @@ def to_pandas(
 
         Args:
             max_download_size (int, default None):
+                .. deprecated:: 2.0.0
+                    `max_download_size` parameter is deprecated. Please use `to_pandas_batch()` method
+                    instead.
+
                 Download size threshold in MB. If max_download_size is exceeded when downloading data
                 (e.g., to_pandas()), the data will be downsampled if
                 bigframes.options.sampling.enable_downsampling is True, otherwise, an error will be
                 raised. If set to a value other than None, this will supersede the global config.
             sampling_method (str, default None):
+                .. deprecated:: 2.0.0
+                    `sampling_method` parameter is deprecated. Please use `sample()` method instead.
+
                 Downsampling algorithms to be chosen from, the choices are: "head": This algorithm
                 returns a portion of the data from the beginning. It is fast and requires minimal
                 computations to perform the downsampling; "uniform": This algorithm returns uniform
                 random samples of the data. If set to a value other than None, this will supersede
                 the global config.
             random_state (int, default None):
+                .. deprecated:: 2.0.0
+                    `random_state` parameter is deprecated. Please use `sample()` method instead.
+
                 The seed for the uniform downsampling algorithm. If provided, the uniform method may
                 take longer to execute and require more computation. If set to a value other than
                 None, this will supersede the global config.
@@ -1699,6 +1709,20 @@ def to_pandas(
             downsampled rows and all columns of this DataFrame. If dry_run is set, a pandas
             Series containing dry run statistics will be returned.
         """
+        if max_download_size is not None:
+            msg = bfe.format_message(
+                "DEPRECATED: The `max_download_size` parameters for `DataFrame.to_pandas()` "
+                "are deprecated and will be removed soon. Please use `DataFrame.to_pandas_batch()`."
+            )
+            warnings.warn(msg, category=UserWarning)
+        if sampling_method is not None or random_state is not None:
+            msg = bfe.format_message(
+                "DEPRECATED: The `sampling_method` and `random_state` parameters for "
+                "`DataFrame.to_pandas()` are deprecated and will be removed soon. "
+                "Please use `DataFrame.sample().to_pandas()` instead for sampling."
+            )
+            warnings.warn(msg, category=UserWarning, stacklevel=2)
+
         if dry_run:
             dry_run_stats, dry_run_job = self._block._compute_dry_run(
                 max_download_size=max_download_size,
diff --git a/bigframes/series.py b/bigframes/series.py
index 305bc93a09..ccc6e3c7b6 100644
--- a/bigframes/series.py
+++ b/bigframes/series.py
@@ -35,6 +35,7 @@
     Tuple,
     Union,
 )
+import warnings
 
 import bigframes_vendored.constants as constants
 import bigframes_vendored.pandas.core.series as vendored_pandas_series
@@ -60,6 +61,7 @@
 import bigframes.core.window_spec as windows
 import bigframes.dataframe
 import bigframes.dtypes
+import bigframes.exceptions as bfe
 import bigframes.formatting_helpers as formatter
 import bigframes.operations as ops
 import bigframes.operations.aggregations as agg_ops
@@ -431,17 +433,27 @@ def to_pandas(
 
         Args:
             max_download_size (int, default None):
+                .. deprecated:: 2.0.0
+                    `max_download_size` parameter is deprecated. Please use `to_pandas_batch()` method
+                    instead.
+
                 Download size threshold in MB. If max_download_size is exceeded when downloading data
                 (e.g., to_pandas()), the data will be downsampled if
                 bigframes.options.sampling.enable_downsampling is True, otherwise, an error will be
                 raised. If set to a value other than None, this will supersede the global config.
             sampling_method (str, default None):
+                .. deprecated:: 2.0.0
+                    `sampling_method` parameter is deprecated. Please use `sample()` method instead.
+
                 Downsampling algorithms to be chosen from, the choices are: "head": This algorithm
                 returns a portion of the data from the beginning. It is fast and requires minimal
                 computations to perform the downsampling; "uniform": This algorithm returns uniform
                 random samples of the data. If set to a value other than None, this will supersede
                 the global config.
             random_state (int, default None):
+                .. deprecated:: 2.0.0
+                    `random_state` parameter is deprecated. Please use `sample()` method instead.
+
                 The seed for the uniform downsampling algorithm. If provided, the uniform method may
                 take longer to execute and require more computation. If set to a value other than
                 None, this will supersede the global config.
@@ -460,6 +472,19 @@ def to_pandas(
             is not exceeded; otherwise, a pandas Series with downsampled rows of the DataFrame. If
             dry_run is set to True, a pandas Series containing dry run statistics will be returned.
         """
+        if max_download_size is not None:
+            msg = bfe.format_message(
+                "DEPRECATED: The `max_download_size` parameters for `Series.to_pandas()` "
+                "are deprecated and will be removed soon. Please use `Series.to_pandas_batch()`."
+            )
+            warnings.warn(msg, category=UserWarning)
+        if sampling_method is not None or random_state is not None:
+            msg = bfe.format_message(
+                "DEPRECATED: The `sampling_method` and `random_state` parameters for "
+                "`Series.to_pandas()` are deprecated and will be removed soon. "
+                "Please use `Series.sample().to_pandas()` instead for sampling."
+            )
+            warnings.warn(msg, category=UserWarning)
 
         if dry_run:
             dry_run_stats, dry_run_job = self._block._compute_dry_run(
diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py
index 914c953f99..e8316f253b 100644
--- a/tests/system/small/test_dataframe.py
+++ b/tests/system/small/test_dataframe.py
@@ -5203,9 +5203,7 @@ def test_query_complexity_repeated_subtrees(
     # See: https://github.com/python/cpython/issues/112282
     reason="setrecursionlimit has no effect on the Python C stack since Python 3.12.",
 )
-def test_query_complexity_repeated_analytic(
-    scalars_df_index, scalars_pandas_df_index, with_multiquery_execution
-):
+def test_query_complexity_repeated_analytic(scalars_df_index, scalars_pandas_df_index):
     bf_df = scalars_df_index[["int64_col", "int64_too"]]
     pd_df = scalars_pandas_df_index[["int64_col", "int64_too"]]
     # Uses LAG analytic operator, each in a new SELECT
@@ -5217,22 +5215,6 @@ def test_query_complexity_repeated_analytic(
     assert_pandas_df_equal(bf_result, pd_result)
 
 
-def test_to_pandas_downsampling_option_override(session):
-    df = session.read_gbq("bigframes-dev.bigframes_tests_sys.batting")
-    download_size = 1
-
-    # limits only apply for allow_large_result=True
-    df = df.to_pandas(
-        max_download_size=download_size,
-        sampling_method="head",
-        allow_large_results=True,
-    )
-
-    total_memory_bytes = df.memory_usage(deep=True).sum()
-    total_memory_mb = total_memory_bytes / (1024 * 1024)
-    assert total_memory_mb == pytest.approx(download_size, rel=0.5)
-
-
 def test_to_gbq_and_create_dataset(session, scalars_df_index, dataset_id_not_created):
     dataset_id = dataset_id_not_created
     destination_table = f"{dataset_id}.scalars_df"
diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py
index 9886102e2e..22e05727ba 100644
--- a/tests/system/small/test_dataframe_io.py
+++ b/tests/system/small/test_dataframe_io.py
@@ -266,6 +266,62 @@ def test_to_pandas_override_global_option(scalars_df_index):
         assert scalars_df_index._query_job.destination.table_id == table_id
 
 
+def test_to_pandas_downsampling_option_override(session):
+    df = session.read_gbq("bigframes-dev.bigframes_tests_sys.batting")
+    download_size = 1
+
+    with pytest.warns(
+        UserWarning, match="The data size .* exceeds the maximum download limit"
+    ):
+        # limits only apply for allow_large_result=True
+        df = df.to_pandas(
+            max_download_size=download_size,
+            sampling_method="head",
+            allow_large_results=True,
+        )
+
+    total_memory_bytes = df.memory_usage(deep=True).sum()
+    total_memory_mb = total_memory_bytes / (1024 * 1024)
+    assert total_memory_mb == pytest.approx(download_size, rel=0.5)
+
+
+@pytest.mark.parametrize(
+    ("kwargs", "message"),
+    [
+        pytest.param(
+            {"sampling_method": "head"},
+            r"DEPRECATED[\S\s]*sampling_method[\S\s]*DataFrame",
+            id="sampling_method",
+        ),
+        pytest.param(
+            {"random_state": 10},
+            r"DEPRECATED[\S\s]*random_state[\S\s]*DataFrame",
+            id="random_state",
+        ),
+        pytest.param(
+            {"max_download_size": 10},
+            r"DEPRECATED[\S\s]*max_download_size[\S\s]*DataFrame",
+            id="max_download_size",
+        ),
+    ],
+)
+def test_to_pandas_warns_deprecated_parameters(scalars_df_index, kwargs, message):
+    with pytest.warns(UserWarning, match=message):
+        scalars_df_index.to_pandas(
+            # limits only apply for allow_large_result=True
+            allow_large_results=True,
+            **kwargs,
+        )
+
+
+def test_to_pandas_dry_run(session, scalars_pandas_df_multi_index):
+    bf_df = session.read_pandas(scalars_pandas_df_multi_index)
+
+    result = bf_df.to_pandas(dry_run=True)
+
+    assert len(result) == 14
+
+
 def test_to_arrow_override_global_option(scalars_df_index):
     # Direct call to_arrow uses global default setting (allow_large_results=True),
     with bigframes.option_context("bigquery.allow_large_results", True):
@@ -813,11 +869,3 @@ def test_to_sql_query_named_index_excluded(
     utils.assert_pandas_df_equal(
         roundtrip.to_pandas(), pd_df, check_index_type=False, ignore_order=True
     )
-
-
-def test_to_pandas_dry_run(session, scalars_pandas_df_multi_index):
-    bf_df = session.read_pandas(scalars_pandas_df_multi_index)
-
-    result = bf_df.to_pandas(dry_run=True)
-
-    assert len(result) == 14
diff --git a/tests/system/small/test_index.py b/tests/system/small/test_index.py
index 535e4bc9ae..9f45c8465b 100644
--- a/tests/system/small/test_index.py
+++ b/tests/system/small/test_index.py
@@ -426,11 +426,3 @@ def test_multiindex_repr_includes_all_names(session):
     )
     index = session.read_pandas(df).set_index(["A", "B"]).index
     assert "names=['A', 'B']" in repr(index)
-
-
-def test_to_pandas_dry_run(scalars_df_index):
-    index = scalars_df_index.index
-
-    result = index.to_pandas(dry_run=True)
-
-    assert len(result) == 14
diff --git a/tests/system/small/test_index_io.py b/tests/system/small/test_index_io.py
index 3bf9794f5a..fcb3fa3920 100644
--- a/tests/system/small/test_index_io.py
+++ b/tests/system/small/test_index_io.py
@@ -30,6 +30,14 @@ def test_to_pandas_override_global_option(scalars_df_index):
         assert bf_index._query_job.destination.table_id == table_id
 
 
+def test_to_pandas_dry_run(scalars_df_index):
+    index = scalars_df_index.index
+
+    result = index.to_pandas(dry_run=True)
+
+    assert len(result) == 14
+
+
 def test_to_numpy_override_global_option(scalars_df_index):
     with bigframes.option_context("bigquery.allow_large_results", True):
 
diff --git a/tests/system/small/test_series_io.py b/tests/system/small/test_series_io.py
index 8a699aed73..decdd61e75 100644
--- a/tests/system/small/test_series_io.py
+++ b/tests/system/small/test_series_io.py
@@ -14,7 +14,10 @@
 import pandas as pd
 import pytest
+import pytest
+
 
 import bigframes
+import bigframes.series
 
 
 def test_to_pandas_override_global_option(scalars_df_index):
@@ -37,6 +40,36 @@ def test_to_pandas_override_global_option(scalars_df_index):
     assert session._metrics.execution_count - execution_count == 1
 
 
+@pytest.mark.parametrize(
+    ("kwargs", "message"),
+    [
+        pytest.param(
+            {"sampling_method": "head"},
+            r"DEPRECATED[\S\s]*sampling_method[\S\s]*Series",
+            id="sampling_method",
+        ),
+        pytest.param(
+            {"random_state": 10},
+            r"DEPRECATED[\S\s]*random_state[\S\s]*Series",
+            id="random_state",
+        ),
+        pytest.param(
+            {"max_download_size": 10},
+            r"DEPRECATED[\S\s]*max_download_size[\S\s]*Series",
+            id="max_download_size",
+        ),
+    ],
+)
+def test_to_pandas_warns_deprecated_parameters(scalars_df_index, kwargs, message):
+    s: bigframes.series.Series = scalars_df_index["int64_col"]
+    with pytest.warns(UserWarning, match=message):
+        s.to_pandas(
+            # limits only apply for allow_large_result=True
+            allow_large_results=True,
+            **kwargs,
+        )
+
+
 @pytest.mark.parametrize(
     ("page_size", "max_results", "allow_large_results"),
     [

From 626d432242f8ead8de0ac5fee51795446d61d9d6 Mon Sep 17 00:00:00 2001
From: Chelsea Lin
Date: Mon, 7 Apr 2025 22:57:33 +0000
Subject: [PATCH 2/4] fix tests

---
 bigframes/dataframe.py                  | 18 +++++++++---------
 bigframes/series.py                     | 18 +++++++++---------
 tests/system/small/test_dataframe_io.py |  6 +++---
 tests/system/small/test_series_io.py    |  8 +++-----
 4 files changed, 24 insertions(+), 26 deletions(-)

diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
index 72ec42c247..7f490c7dac 100644
--- a/bigframes/dataframe.py
+++ b/bigframes/dataframe.py
@@ -1670,16 +1670,16 @@ def to_pandas(
         Args:
             max_download_size (int, default None):
                 .. deprecated:: 2.0.0
-                    `max_download_size` parameter is deprecated. Please use `to_pandas_batch()` method
-                    instead.
+                    ``max_download_size`` parameter is deprecated. Please use ``to_pandas_batches()``
+                    method instead.
 
-                Download size threshold in MB. If max_download_size is exceeded when downloading data
-                (e.g., to_pandas()), the data will be downsampled if
-                bigframes.options.sampling.enable_downsampling is True, otherwise, an error will be
-                raised. If set to a value other than None, this will supersede the global config.
+                Download size threshold in MB. If ``max_download_size`` is exceeded when downloading data,
+                the data will be downsampled if ``bigframes.options.sampling.enable_downsampling`` is
+                ``True``, otherwise, an error will be raised. If set to a value other than ``None``,
+                this will supersede the global config.
             sampling_method (str, default None):
                 .. deprecated:: 2.0.0
-                    `sampling_method` parameter is deprecated. Please use `sample()` method instead.
+                    ``sampling_method`` parameter is deprecated. Please use ``sample()`` method instead.
 
                 Downsampling algorithms to be chosen from, the choices are: "head": This algorithm
                 returns a portion of the data from the beginning. It is fast and requires minimal
@@ -1688,7 +1688,7 @@ def to_pandas(
                 the global config.
             random_state (int, default None):
                 .. deprecated:: 2.0.0
-                    `random_state` parameter is deprecated. Please use `sample()` method instead.
+                    ``random_state`` parameter is deprecated. Please use ``sample()`` method instead.
 
                 The seed for the uniform downsampling algorithm. If provided, the uniform method may
                 take longer to execute and require more computation. If set to a value other than
@@ -1712,7 +1712,7 @@ def to_pandas(
         if max_download_size is not None:
             msg = bfe.format_message(
                 "DEPRECATED: The `max_download_size` parameters for `DataFrame.to_pandas()` "
-                "are deprecated and will be removed soon. Please use `DataFrame.to_pandas_batch()`."
+                "are deprecated and will be removed soon. Please use `DataFrame.to_pandas_batches()`."
             )
             warnings.warn(msg, category=UserWarning)
         if sampling_method is not None or random_state is not None:
diff --git a/bigframes/series.py b/bigframes/series.py
index ccc6e3c7b6..e139a4754c 100644
--- a/bigframes/series.py
+++ b/bigframes/series.py
@@ -434,16 +434,16 @@ def to_pandas(
         Args:
             max_download_size (int, default None):
                 .. deprecated:: 2.0.0
-                    `max_download_size` parameter is deprecated. Please use `to_pandas_batch()` method
-                    instead.
+                    ``max_download_size`` parameter is deprecated. Please use ``to_pandas_batches()``
+                    method instead.
 
-                Download size threshold in MB. If max_download_size is exceeded when downloading data
-                (e.g., to_pandas()), the data will be downsampled if
-                bigframes.options.sampling.enable_downsampling is True, otherwise, an error will be
-                raised. If set to a value other than None, this will supersede the global config.
+                Download size threshold in MB. If ``max_download_size`` is exceeded when downloading data,
+                the data will be downsampled if ``bigframes.options.sampling.enable_downsampling`` is
+                ``True``, otherwise, an error will be raised. If set to a value other than ``None``,
+                this will supersede the global config.
             sampling_method (str, default None):
                 .. deprecated:: 2.0.0
-                    `sampling_method` parameter is deprecated. Please use `sample()` method instead.
+                    ``sampling_method`` parameter is deprecated. Please use ``sample()`` method instead.
 
                 Downsampling algorithms to be chosen from, the choices are: "head": This algorithm
                 returns a portion of the data from the beginning. It is fast and requires minimal
@@ -452,7 +452,7 @@ def to_pandas(
                 the global config.
             random_state (int, default None):
                 .. deprecated:: 2.0.0
-                    `random_state` parameter is deprecated. Please use `sample()` method instead.
+                    ``random_state`` parameter is deprecated. Please use ``sample()`` method instead.
 
                 The seed for the uniform downsampling algorithm. If provided, the uniform method may
                 take longer to execute and require more computation. If set to a value other than
@@ -475,7 +475,7 @@ def to_pandas(
         if max_download_size is not None:
             msg = bfe.format_message(
                 "DEPRECATED: The `max_download_size` parameters for `Series.to_pandas()` "
-                "are deprecated and will be removed soon. Please use `Series.to_pandas_batch()`."
+                "are deprecated and will be removed soon. Please use `Series.to_pandas_batches()`."
             )
             warnings.warn(msg, category=UserWarning)
         if sampling_method is not None or random_state is not None:
diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py
index 22e05727ba..2269b5a7b3 100644
--- a/tests/system/small/test_dataframe_io.py
+++ b/tests/system/small/test_dataframe_io.py
@@ -290,17 +290,17 @@ def test_to_pandas_downsampling_option_override(session):
     [
         pytest.param(
             {"sampling_method": "head"},
-            r"DEPRECATED[\S\s]*sampling_method[\S\s]*DataFrame",
+            r"DEPRECATED[\S\s]*sampling_method[\S\s]*DataFrame.sample",
             id="sampling_method",
         ),
         pytest.param(
             {"random_state": 10},
-            r"DEPRECATED[\S\s]*random_state[\S\s]*DataFrame",
+            r"DEPRECATED[\S\s]*random_state[\S\s]*DataFrame.sample",
             id="random_state",
         ),
         pytest.param(
             {"max_download_size": 10},
-            r"DEPRECATED[\S\s]*max_download_size[\S\s]*DataFrame",
+            r"DEPRECATED[\S\s]*max_download_size[\S\s]*DataFrame.to_pandas_batches",
             id="max_download_size",
         ),
     ],
diff --git a/tests/system/small/test_series_io.py b/tests/system/small/test_series_io.py
index decdd61e75..34261d3c74 100644
--- a/tests/system/small/test_series_io.py
+++ b/tests/system/small/test_series_io.py
@@ -14,8 +14,6 @@
 import pandas as pd
 import pytest
-import pytest
-
 
 import bigframes
 import bigframes.series
 
@@ -45,17 +43,17 @@ def test_to_pandas_override_global_option(scalars_df_index):
     [
         pytest.param(
             {"sampling_method": "head"},
-            r"DEPRECATED[\S\s]*sampling_method[\S\s]*Series",
+            r"DEPRECATED[\S\s]*sampling_method[\S\s]*Series.sample",
             id="sampling_method",
         ),
         pytest.param(
             {"random_state": 10},
-            r"DEPRECATED[\S\s]*random_state[\S\s]*Series",
+            r"DEPRECATED[\S\s]*random_state[\S\s]*Series.sample",
             id="random_state",
         ),
         pytest.param(
             {"max_download_size": 10},
-            r"DEPRECATED[\S\s]*max_download_size[\S\s]*Series",
+            r"DEPRECATED[\S\s]*max_download_size[\S\s]*Series.to_pandas_batches",
             id="max_download_size",
         ),
     ],

From 6f003f528451103d9f56002f400bca88d443d80c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?=
Date: Tue, 8 Apr 2025 09:58:00 -0500
Subject: [PATCH 3/4] use FutureWarning instead of UserWarning

See: https://docs.python.org/3/library/exceptions.html#FutureWarning
---
 bigframes/dataframe.py                  | 4 ++--
 bigframes/series.py                     | 4 ++--
 tests/system/small/test_dataframe_io.py | 4 ++--
 tests/system/small/test_series_io.py    | 2 +-
 4 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
index 7f490c7dac..53490d7771 100644
--- a/bigframes/dataframe.py
+++ b/bigframes/dataframe.py
@@ -1714,14 +1714,14 @@ def to_pandas(
                 "DEPRECATED: The `max_download_size` parameters for `DataFrame.to_pandas()` "
                 "are deprecated and will be removed soon. Please use `DataFrame.to_pandas_batches()`."
             )
-            warnings.warn(msg, category=UserWarning)
+            warnings.warn(msg, category=FutureWarning)
         if sampling_method is not None or random_state is not None:
             msg = bfe.format_message(
                 "DEPRECATED: The `sampling_method` and `random_state` parameters for "
                 "`DataFrame.to_pandas()` are deprecated and will be removed soon. "
                 "Please use `DataFrame.sample().to_pandas()` instead for sampling."
             )
-            warnings.warn(msg, category=UserWarning, stacklevel=2)
+            warnings.warn(msg, category=FutureWarning, stacklevel=2)
 
         if dry_run:
             dry_run_stats, dry_run_job = self._block._compute_dry_run(
diff --git a/bigframes/series.py b/bigframes/series.py
index e139a4754c..1445f9cf1d 100644
--- a/bigframes/series.py
+++ b/bigframes/series.py
@@ -477,14 +477,14 @@ def to_pandas(
                 "DEPRECATED: The `max_download_size` parameters for `Series.to_pandas()` "
                 "are deprecated and will be removed soon. Please use `Series.to_pandas_batches()`."
             )
-            warnings.warn(msg, category=UserWarning)
+            warnings.warn(msg, category=FutureWarning)
         if sampling_method is not None or random_state is not None:
             msg = bfe.format_message(
                 "DEPRECATED: The `sampling_method` and `random_state` parameters for "
                 "`Series.to_pandas()` are deprecated and will be removed soon. "
                 "Please use `Series.sample().to_pandas()` instead for sampling."
             )
-            warnings.warn(msg, category=UserWarning)
+            warnings.warn(msg, category=FutureWarning)
 
         if dry_run:
             dry_run_stats, dry_run_job = self._block._compute_dry_run(
diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py
index 2269b5a7b3..248ea79454 100644
--- a/tests/system/small/test_dataframe_io.py
+++ b/tests/system/small/test_dataframe_io.py
@@ -271,7 +271,7 @@ def test_to_pandas_downsampling_option_override(session):
     download_size = 1
 
     with pytest.warns(
-        UserWarning, match="The data size .* exceeds the maximum download limit"
+        FutureWarning, match="The data size .* exceeds the maximum download limit"
     ):
         # limits only apply for allow_large_result=True
         df = df.to_pandas(
@@ -306,7 +306,7 @@ def test_to_pandas_downsampling_option_override(session):
     ],
 )
 def test_to_pandas_warns_deprecated_parameters(scalars_df_index, kwargs, message):
-    with pytest.warns(UserWarning, match=message):
+    with pytest.warns(FutureWarning, match=message):
         scalars_df_index.to_pandas(
             # limits only apply for allow_large_result=True
             allow_large_results=True,
diff --git a/tests/system/small/test_series_io.py b/tests/system/small/test_series_io.py
index 34261d3c74..235ae65750 100644
--- a/tests/system/small/test_series_io.py
+++ b/tests/system/small/test_series_io.py
@@ -60,7 +60,7 @@ def test_to_pandas_override_global_option(scalars_df_index):
 )
 def test_to_pandas_warns_deprecated_parameters(scalars_df_index, kwargs, message):
     s: bigframes.series.Series = scalars_df_index["int64_col"]
-    with pytest.warns(UserWarning, match=message):
+    with pytest.warns(FutureWarning, match=message):
         s.to_pandas(
             # limits only apply for allow_large_result=True
             allow_large_results=True,

From a22a47001ff056c8b593753474758f5ba344925c Mon Sep 17 00:00:00 2001
From: Chelsea Lin
Date: Tue, 8 Apr 2025 17:39:17 +0000
Subject: [PATCH 4/4] fix tests

---
 tests/system/small/test_dataframe_io.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py
index 248ea79454..a69c26bc54 100644
--- a/tests/system/small/test_dataframe_io.py
+++ b/tests/system/small/test_dataframe_io.py
@@ -271,7 +271,7 @@ def test_to_pandas_downsampling_option_override(session):
     download_size = 1
 
     with pytest.warns(
-        FutureWarning, match="The data size .* exceeds the maximum download limit"
+        UserWarning, match="The data size .* exceeds the maximum download limit"
    ):
         # limits only apply for allow_large_result=True
         df = df.to_pandas(