feat: warn the deprecated `max_download_size`, `random_state` and `sampling_method` parameters in `(DataFrame|Series).to_pandas()` by chelsea-lin · Pull Request #1573 · googleapis/python-bigquery-dataframes · GitHub
[go: up one dir, main page]

Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 28 additions & 4 deletions bigframes/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1669,17 +1669,27 @@ def to_pandas(

Args:
max_download_size (int, default None):
Download size threshold in MB. If max_download_size is exceeded when downloading data
(e.g., to_pandas()), the data will be downsampled if
bigframes.options.sampling.enable_downsampling is True, otherwise, an error will be
raised. If set to a value other than None, this will supersede the global config.
.. deprecated:: 2.0.0
``max_download_size`` parameter is deprecated. Please use ``to_pandas_batches()``
method instead.

Download size threshold in MB. If ``max_download_size`` is exceeded when downloading data,
the data will be downsampled if ``bigframes.options.sampling.enable_downsampling`` is
``True``, otherwise, an error will be raised. If set to a value other than ``None``,
this will supersede the global config.
sampling_method (str, default None):
.. deprecated:: 2.0.0
``sampling_method`` parameter is deprecated. Please use ``sample()`` method instead.

Downsampling algorithms to be chosen from, the choices are: "head": This algorithm
returns a portion of the data from the beginning. It is fast and requires minimal
computations to perform the downsampling; "uniform": This algorithm returns uniform
random samples of the data. If set to a value other than None, this will supersede
the global config.
random_state (int, default None):
.. deprecated:: 2.0.0
``random_state`` parameter is deprecated. Please use ``sample()`` method instead.

The seed for the uniform downsampling algorithm. If provided, the uniform method may
take longer to execute and require more computation. If set to a value other than
None, this will supersede the global config.
Expand All @@ -1699,6 +1709,20 @@ def to_pandas(
downsampled rows and all columns of this DataFrame. If dry_run is set, a pandas
Series containing dry run statistics will be returned.
"""
if max_download_size is not None:
msg = bfe.format_message(
"DEPRECATED: The `max_download_size` parameters for `DataFrame.to_pandas()` "
"are deprecated and will be removed soon. Please use `DataFrame.to_pandas_batches()`."
)
warnings.warn(msg, category=FutureWarning)
if sampling_method is not None or random_state is not None:
msg = bfe.format_message(
"DEPRECATED: The `sampling_method` and `random_state` parameters for "
"`DataFrame.to_pandas()` are deprecated and will be removed soon. "
"Please use `DataFrame.sample().to_pandas()` instead for sampling."
)
warnings.warn(msg, category=FutureWarning, stacklevel=2)

if dry_run:
dry_run_stats, dry_run_job = self._block._compute_dry_run(
max_download_size=max_download_size,
Expand Down
33 changes: 29 additions & 4 deletions bigframes/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
Tuple,
Union,
)
import warnings

import bigframes_vendored.constants as constants
import bigframes_vendored.pandas.core.series as vendored_pandas_series
Expand All @@ -61,6 +62,7 @@
import bigframes.core.window_spec as windows
import bigframes.dataframe
import bigframes.dtypes
import bigframes.exceptions as bfe
import bigframes.formatting_helpers as formatter
import bigframes.operations as ops
import bigframes.operations.aggregations as agg_ops
Expand Down Expand Up @@ -432,17 +434,27 @@ def to_pandas(

Args:
max_download_size (int, default None):
Download size threshold in MB. If max_download_size is exceeded when downloading data
(e.g., to_pandas()), the data will be downsampled if
bigframes.options.sampling.enable_downsampling is True, otherwise, an error will be
raised. If set to a value other than None, this will supersede the global config.
.. deprecated:: 2.0.0
``max_download_size`` parameter is deprecated. Please use ``to_pandas_batches()``
method instead.

Download size threshold in MB. If ``max_download_size`` is exceeded when downloading data,
the data will be downsampled if ``bigframes.options.sampling.enable_downsampling`` is
``True``, otherwise, an error will be raised. If set to a value other than ``None``,
this will supersede the global config.
sampling_method (str, default None):
.. deprecated:: 2.0.0
``sampling_method`` parameter is deprecated. Please use ``sample()`` method instead.

Downsampling algorithms to be chosen from, the choices are: "head": This algorithm
returns a portion of the data from the beginning. It is fast and requires minimal
computations to perform the downsampling; "uniform": This algorithm returns uniform
random samples of the data. If set to a value other than None, this will supersede
the global config.
random_state (int, default None):
.. deprecated:: 2.0.0
``random_state`` parameter is deprecated. Please use ``sample()`` method instead.

The seed for the uniform downsampling algorithm. If provided, the uniform method may
take longer to execute and require more computation. If set to a value other than
None, this will supersede the global config.
Expand All @@ -461,6 +473,19 @@ def to_pandas(
is not exceeded; otherwise, a pandas Series with downsampled rows of the DataFrame. If dry_run
is set to True, a pandas Series containing dry run statistics will be returned.
"""
if max_download_size is not None:
msg = bfe.format_message(
"DEPRECATED: The `max_download_size` parameters for `Series.to_pandas()` "
"are deprecated and will be removed soon. Please use `Series.to_pandas_batches()`."
)
warnings.warn(msg, category=FutureWarning)
if sampling_method is not None or random_state is not None:
msg = bfe.format_message(
"DEPRECATED: The `sampling_method` and `random_state` parameters for "
"`Series.to_pandas()` are deprecated and will be removed soon. "
"Please use `Series.sample().to_pandas()` instead for sampling."
)
warnings.warn(msg, category=FutureWarning)

if dry_run:
dry_run_stats, dry_run_job = self._block._compute_dry_run(
Expand Down
20 changes: 1 addition & 19 deletions tests/system/small/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -5203,9 +5203,7 @@ def test_query_complexity_repeated_subtrees(
# See: https://github.com/python/cpython/issues/112282
reason="setrecursionlimit has no effect on the Python C stack since Python 3.12.",
)
def test_query_complexity_repeated_analytic(
scalars_df_index, scalars_pandas_df_index, with_multiquery_execution
):
def test_query_complexity_repeated_analytic(scalars_df_index, scalars_pandas_df_index):
bf_df = scalars_df_index[["int64_col", "int64_too"]]
pd_df = scalars_pandas_df_index[["int64_col", "int64_too"]]
# Uses LAG analytic operator, each in a new SELECT
Expand All @@ -5217,22 +5215,6 @@ def test_query_complexity_repeated_analytic(
assert_pandas_df_equal(bf_result, pd_result)


def test_to_pandas_downsampling_option_override(session):
df = session.read_gbq("bigframes-dev.bigframes_tests_sys.batting")
download_size = 1

# limits only apply for allow_large_result=True
df = df.to_pandas(
max_download_size=download_size,
sampling_method="head",
allow_large_results=True,
)

total_memory_bytes = df.memory_usage(deep=True).sum()
total_memory_mb = total_memory_bytes / (1024 * 1024)
assert total_memory_mb == pytest.approx(download_size, rel=0.5)


def test_to_gbq_and_create_dataset(session, scalars_df_index, dataset_id_not_created):
dataset_id = dataset_id_not_created
destination_table = f"{dataset_id}.scalars_df"
Expand Down
64 changes: 56 additions & 8 deletions tests/system/small/test_dataframe_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -266,6 +266,62 @@ def test_to_pandas_override_global_option(scalars_df_index):
assert scalars_df_index._query_job.destination.table_id == table_id


def test_to_pandas_downsampling_option_override(session):
    """Exceeding ``max_download_size`` triggers downsampling (with a UserWarning).

    The downloaded frame's in-memory size should land near the requested limit.
    """
    source_df = session.read_gbq("bigframes-dev.bigframes_tests_sys.batting")
    max_size_mb = 1

    with pytest.warns(
        UserWarning, match="The data size .* exceeds the maximum download limit"
    ):
        # limits only apply for allow_large_result=True
        downsampled = source_df.to_pandas(
            max_download_size=max_size_mb,
            sampling_method="head",
            allow_large_results=True,
        )

    # Memory footprint of the result should approximate the configured cap.
    memory_mb = downsampled.memory_usage(deep=True).sum() / (1024 * 1024)
    assert memory_mb == pytest.approx(max_size_mb, rel=0.5)


@pytest.mark.parametrize(
    ("kwargs", "message"),
    [
        pytest.param(
            {"sampling_method": "head"},
            r"DEPRECATED[\S\s]*sampling_method[\S\s]*DataFrame.sample",
            id="sampling_method",
        ),
        pytest.param(
            {"random_state": 10},
            r"DEPRECATED[\S\s]*random_state[\S\s]*DataFrame.sample",
            id="random_state",
        ),
        pytest.param(
            {"max_download_size": 10},
            r"DEPRECATED[\S\s]*max_download_size[\S\s]*DataFrame.to_pandas_batches",
            id="max_download_size",
        ),
    ],
)
def test_to_pandas_warns_deprecated_parameters(scalars_df_index, kwargs, message):
    """Each deprecated ``to_pandas()`` parameter raises a FutureWarning naming its replacement."""
    # limits only apply for allow_large_result=True
    call_kwargs = dict(kwargs, allow_large_results=True)
    with pytest.warns(FutureWarning, match=message):
        scalars_df_index.to_pandas(**call_kwargs)


def test_to_pandas_dry_run(session, scalars_pandas_df_multi_index):
    """``to_pandas(dry_run=True)`` returns dry-run statistics (14 entries) instead of data."""
    frame = session.read_pandas(scalars_pandas_df_multi_index)

    stats = frame.to_pandas(dry_run=True)

    assert len(stats) == 14


def test_to_arrow_override_global_option(scalars_df_index):
# Direct call to_arrow uses global default setting (allow_large_results=True),
with bigframes.option_context("bigquery.allow_large_results", True):
Expand Down Expand Up @@ -813,11 +869,3 @@ def test_to_sql_query_named_index_excluded(
utils.assert_pandas_df_equal(
roundtrip.to_pandas(), pd_df, check_index_type=False, ignore_order=True
)


def test_to_pandas_dry_run(session, scalars_pandas_df_multi_index):
bf_df = session.read_pandas(scalars_pandas_df_multi_index)

result = bf_df.to_pandas(dry_run=True)

assert len(result) == 14
8 changes: 0 additions & 8 deletions tests/system/small/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -426,11 +426,3 @@ def test_multiindex_repr_includes_all_names(session):
)
index = session.read_pandas(df).set_index(["A", "B"]).index
assert "names=['A', 'B']" in repr(index)


def test_to_pandas_dry_run(scalars_df_index):
index = scalars_df_index.index

result = index.to_pandas(dry_run=True)

assert len(result) == 14
8 changes: 8 additions & 0 deletions tests/system/small/test_index_io.py
Original file line number Diff line number Diff line change

Expand Up

@@ -30,6 +30,14 @@ def test_to_pandas_override_global_option(scalars_df_index):
assert bf_index._query_job.destination.table_id == table_id


def test_to_pandas_dry_run(scalars_df_index):
    """``Index.to_pandas(dry_run=True)`` returns dry-run statistics (14 entries)."""
    stats = scalars_df_index.index.to_pandas(dry_run=True)

    assert len(stats) == 14


def test_to_numpy_override_global_option(scalars_df_index):
with bigframes.option_context("bigquery.allow_large_results", True):

Expand Down
31 changes: 31 additions & 0 deletions tests/system/small/test_series_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import pytest

import bigframes
import bigframes.series


def test_to_pandas_override_global_option(scalars_df_index):
Expand All @@ -37,6 +38,36 @@ def test_to_pandas_override_global_option(scalars_df_index):
assert session._metrics.execution_count - execution_count == 1


@pytest.mark.parametrize(
    ("kwargs", "message"),
    [
        pytest.param(
            {"sampling_method": "head"},
            r"DEPRECATED[\S\s]*sampling_method[\S\s]*Series.sample",
            id="sampling_method",
        ),
        pytest.param(
            {"random_state": 10},
            r"DEPRECATED[\S\s]*random_state[\S\s]*Series.sample",
            id="random_state",
        ),
        pytest.param(
            {"max_download_size": 10},
            r"DEPRECATED[\S\s]*max_download_size[\S\s]*Series.to_pandas_batches",
            id="max_download_size",
        ),
    ],
)
def test_to_pandas_warns_deprecated_parameters(scalars_df_index, kwargs, message):
    """Each deprecated ``Series.to_pandas()`` parameter raises a FutureWarning naming its replacement."""
    series: bigframes.series.Series = scalars_df_index["int64_col"]
    # limits only apply for allow_large_result=True
    call_kwargs = dict(kwargs, allow_large_results=True)
    with pytest.warns(FutureWarning, match=message):
        series.to_pandas(**call_kwargs)


@pytest.mark.parametrize(
("page_size", "max_results", "allow_large_results"),
[
Expand Down