From a7089d527ee43f4898f8d5b34a703972eb8efe4b Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Fri, 3 May 2024 17:36:57 +0000 Subject: [PATCH] fix: downgrade NoDefaultIndexError to DefaultIndexWarning --- bigframes/exceptions.py | 4 +- .../session/_io/bigquery/read_gbq_table.py | 7 ++-- tests/unit/session/test_session.py | 39 +++++++++++-------- .../bigframes_vendored/pandas/io/gbq.py | 2 +- 4 files changed, 30 insertions(+), 22 deletions(-) diff --git a/bigframes/exceptions.py b/bigframes/exceptions.py index 222df069f6..5caf2aa1df 100644 --- a/bigframes/exceptions.py +++ b/bigframes/exceptions.py @@ -25,5 +25,5 @@ class CleanupFailedWarning(Warning): """Bigframes failed to clean up a table resource.""" -class NoDefaultIndexError(ValueError): - """Unable to create a default index.""" +class DefaultIndexWarning(Warning): + """Default index may cause unexpected costs.""" diff --git a/bigframes/session/_io/bigquery/read_gbq_table.py b/bigframes/session/_io/bigquery/read_gbq_table.py index 29d5a5567f..f6c1463e6c 100644 --- a/bigframes/session/_io/bigquery/read_gbq_table.py +++ b/bigframes/session/_io/bigquery/read_gbq_table.py @@ -277,13 +277,14 @@ def get_index_cols_and_uniqueness( # resource utilization because of the default sequential index. See # internal issue 335727141. if _is_table_clustered_or_partitioned(table) and not primary_keys: - raise bigframes.exceptions.NoDefaultIndexError( + warnings.warn( f"Table '{str(table.reference)}' is clustered and/or " "partitioned, but BigQuery DataFrames was not able to find a " - "suitable index. To avoid this error, set at least one of: " + "suitable index. To avoid this warning, set at least one of: " # TODO(b/338037499): Allow max_results to override this too, # once we make it more efficient. - "`index_col` or `filters`." + "`index_col` or `filters`.", + category=bigframes.exceptions.DefaultIndexWarning, ) # If there are primary keys defined, the query engine assumes these diff --git a/tests/unit/session/test_session.py b/tests/unit/session/test_session.py index 70a121435c..a161c2df76 100644 --- a/tests/unit/session/test_session.py +++ b/tests/unit/session/test_session.py @@ -17,6 +17,7 @@ import os import re from unittest import mock +import warnings import google.api_core.exceptions import google.cloud.bigquery @@ -186,7 +187,7 @@ def get_table_mock(table_ref): @pytest.mark.parametrize("table", CLUSTERED_OR_PARTITIONED_TABLES) -def test_no_default_index_error_raised_by_read_gbq(table): +def test_default_index_warning_raised_by_read_gbq(table): """Because of the windowing operation to create a default index, row filters can't push down to the clustering column. @@ -202,12 +203,12 @@ def test_no_default_index_error_raised_by_read_gbq(table): session = resources.create_bigquery_session(bqclient=bqclient) table._properties["location"] = session._location - with pytest.raises(bigframes.exceptions.NoDefaultIndexError): + with pytest.warns(bigframes.exceptions.DefaultIndexWarning): session.read_gbq("my-project.my_dataset.my_table") @pytest.mark.parametrize("table", CLUSTERED_OR_PARTITIONED_TABLES) -def test_no_default_index_error_not_raised_by_read_gbq_index_col_sequential_int64( +def test_default_index_warning_not_raised_by_read_gbq_index_col_sequential_int64( table, ): """Because of the windowing operation to create a default index, row @@ -224,11 +225,13 @@ def test_no_default_index_error_not_raised_by_read_gbq_index_col_sequential_int6 session = resources.create_bigquery_session(bqclient=bqclient) table._properties["location"] = session._location - # No exception raised because we set the option allowing the default indexes. - df = session.read_gbq( - "my-project.my_dataset.my_table", - index_col=bigframes.enums.DefaultIndexKind.SEQUENTIAL_INT64, - ) + # No warnings raised because we set the option allowing the default indexes. + with warnings.catch_warnings(): + warnings.simplefilter("error", bigframes.exceptions.DefaultIndexWarning) + df = session.read_gbq( + "my-project.my_dataset.my_table", + index_col=bigframes.enums.DefaultIndexKind.SEQUENTIAL_INT64, + ) # We expect a window operation because we specificaly requested a sequential index. generated_sql = df.sql.casefold() @@ -246,7 +249,7 @@ def test_no_default_index_error_not_raised_by_read_gbq_index_col_sequential_int6 ), ) @pytest.mark.parametrize("table", CLUSTERED_OR_PARTITIONED_TABLES) -def test_no_default_index_error_not_raised_by_read_gbq_index_col_columns( +def test_default_index_warning_not_raised_by_read_gbq_index_col_columns( total_count, distinct_count, table, @@ -270,10 +273,12 @@ def test_no_default_index_error_not_raised_by_read_gbq_index_col_columns( ) table._properties["location"] = session._location - # No exception raised because there are columns to use as the index. - df = session.read_gbq( - "my-project.my_dataset.my_table", index_col=("idx_1", "idx_2") - ) + # No warning raised because there are columns to use as the index. + with warnings.catch_warnings(): + warnings.simplefilter("error", bigframes.exceptions.DefaultIndexWarning) + df = session.read_gbq( + "my-project.my_dataset.my_table", index_col=("idx_1", "idx_2") + ) # There should be no analytic operators to prevent row filtering pushdown. assert "OVER" not in df.sql @@ -281,7 +286,7 @@ def test_no_default_index_error_not_raised_by_read_gbq_index_col_columns( @pytest.mark.parametrize("table", CLUSTERED_OR_PARTITIONED_TABLES) -def test_no_default_index_error_not_raised_by_read_gbq_primary_key(table): +def test_default_index_warning_not_raised_by_read_gbq_primary_key(table): """If a primary key is set on the table, we use that as the index column by default, no error should be raised in this case. @@ -310,8 +315,10 @@ def test_no_default_index_error_not_raised_by_read_gbq_primary_key(table): ) table._properties["location"] = session._location - # No exception raised because there is a primary key to use as the index. - df = session.read_gbq("my-project.my_dataset.my_table") + # No warning raised because there is a primary key to use as the index. + with warnings.catch_warnings(): + warnings.simplefilter("error", bigframes.exceptions.DefaultIndexWarning) + df = session.read_gbq("my-project.my_dataset.my_table") # There should be no analytic operators to prevent row filtering pushdown. assert "OVER" not in df.sql diff --git a/third_party/bigframes_vendored/pandas/io/gbq.py b/third_party/bigframes_vendored/pandas/io/gbq.py index c25dd8776f..38ea208eaf 100644 --- a/third_party/bigframes_vendored/pandas/io/gbq.py +++ b/third_party/bigframes_vendored/pandas/io/gbq.py @@ -157,7 +157,7 @@ def read_gbq( Alias for columns, retained for backwards compatibility. Raises: - bigframes.exceptions.NoDefaultIndexError: + bigframes.exceptions.DefaultIndexWarning: Using the default index is discouraged, such as with clustered or partitioned tables without primary keys.