From 423db293ab22f8a51c442f9e049f097add0755a0 Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Tue, 14 Nov 2023 01:11:01 +0000 Subject: [PATCH 1/3] fix: make to_pandas override enable_downsampling when sampling_method is manually set. --- bigframes/core/blocks.py | 42 ++++++++++++++++------------ tests/system/small/test_dataframe.py | 11 ++++++++ 2 files changed, 35 insertions(+), 18 deletions(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 6358d28e2e..b0c30eb690 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -390,23 +390,6 @@ def to_pandas( ordered: bool = True, ) -> Tuple[pd.DataFrame, bigquery.QueryJob]: """Run query and download results as a pandas DataFrame.""" - if max_download_size is None: - max_download_size = bigframes.options.sampling.max_download_size - if sampling_method is None: - sampling_method = ( - bigframes.options.sampling.sampling_method - if bigframes.options.sampling.sampling_method is not None - else _UNIFORM - ) - if random_state is None: - random_state = bigframes.options.sampling.random_state - - sampling_method = sampling_method.lower() - if sampling_method not in _SAMPLING_METHODS: - raise NotImplementedError( - f"The downsampling method {sampling_method} is not implemented, " - f"please choose from {','.join(_SAMPLING_METHODS)}." - ) df, _, query_job = self._compute_and_count( value_keys=value_keys, @@ -454,6 +437,29 @@ def _compute_and_count( ) -> Tuple[pd.DataFrame, int, bigquery.QueryJob]: """Run query and download results as a pandas DataFrame. Return the total number of results as well.""" # TODO(swast): Allow for dry run and timeout. + enable_downsampling = bigframes.options.sampling.enable_downsampling + if max_download_size is None: + max_download_size = bigframes.options.sampling.max_download_size + + if sampling_method is None: + sampling_method = ( + bigframes.options.sampling.sampling_method + if bigframes.options.sampling.sampling_method is not None + else _UNIFORM + ) + else: + enable_downsampling = True + + if random_state is None: + random_state = bigframes.options.sampling.random_state + + sampling_method = sampling_method.lower() + if sampling_method not in _SAMPLING_METHODS: + raise NotImplementedError( + f"The downsampling method {sampling_method} is not implemented, " + f"please choose from {','.join(_SAMPLING_METHODS)}." + ) + expr = self._apply_value_keys_to_expr(value_keys=value_keys) results_iterator, query_job = expr.start_query( @@ -470,7 +476,7 @@ def _compute_and_count( ) if fraction < 1: - if not bigframes.options.sampling.enable_downsampling: + if not enable_downsampling: raise RuntimeError( f"The data size ({table_size:.2f} MB) exceeds the maximum download limit of " f"{max_download_size} MB. You can:\n\t* Enable downsampling in global options:\n" diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index e522878229..124ef612b9 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -3515,3 +3515,14 @@ def test_df_dot_operator_series( bf_result, pd_result, ) + + +def test_to_pandas_downsampling_option_override(session): + df = session.read_gbq("bigframes-dev.bigframes_tests_sys.batting") + download_size = 1 + + df = df.to_pandas(max_download_size=download_size, sampling_method="head") + + total_memory_bytes = df.memory_usage(deep=True).sum() + total_memory_mb = total_memory_bytes / (1024 * 1024) + assert total_memory_mb == pytest.approx(download_size, rel=0.3) From d28a178bb56a37c285be60440bf9617061ceccfd Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Tue, 14 Nov 2023 01:11:01 +0000 Subject: [PATCH 2/3] fix: make to_pandas override enable_downsampling when sampling_method is manually set. --- bigframes/core/blocks.py | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index b0c30eb690..17d748c8c3 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -437,9 +437,23 @@ def _compute_and_count( ) -> Tuple[pd.DataFrame, int, bigquery.QueryJob]: """Run query and download results as a pandas DataFrame. Return the total number of results as well.""" # TODO(swast): Allow for dry run and timeout. - enable_downsampling = bigframes.options.sampling.enable_downsampling - if max_download_size is None: - max_download_size = bigframes.options.sampling.max_download_size + enable_downsampling = ( + True + if sampling_method is not None + else bigframes.options.sampling.enable_downsampling + ) + + max_download_size = ( + max_download_size + if max_download_size is not None + else bigframes.options.sampling.max_download_size + ) + + random_state = ( + random_state + if random_state is not None + else bigframes.options.sampling.random_state + ) if sampling_method is None: sampling_method = ( @@ -447,13 +461,8 @@ def _compute_and_count( if bigframes.options.sampling.sampling_method is not None else _UNIFORM ) - else: - enable_downsampling = True - - if random_state is None: - random_state = bigframes.options.sampling.random_state - sampling_method = sampling_method.lower() + if sampling_method not in _SAMPLING_METHODS: raise NotImplementedError( f"The downsampling method {sampling_method} is not implemented, " From ced1e4e21abd91d2a3785a0f42c459e345ff2c10 Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Tue, 14 Nov 2023 01:11:01 +0000 Subject: [PATCH 3/3] fix: make to_pandas override enable_downsampling when sampling_method is manually set. --- bigframes/core/blocks.py | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 17d748c8c3..e8fc9ea135 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -444,23 +444,13 @@ def _compute_and_count( ) max_download_size = ( - max_download_size - if max_download_size is not None - else bigframes.options.sampling.max_download_size + max_download_size or bigframes.options.sampling.max_download_size ) - random_state = ( - random_state - if random_state is not None - else bigframes.options.sampling.random_state - ) + random_state = random_state or bigframes.options.sampling.random_state if sampling_method is None: - sampling_method = ( - bigframes.options.sampling.sampling_method - if bigframes.options.sampling.sampling_method is not None - else _UNIFORM - ) + sampling_method = bigframes.options.sampling.sampling_method or _UNIFORM sampling_method = sampling_method.lower() if sampling_method not in _SAMPLING_METHODS: