From 351fba1faa610c2b7a90e50acdc74909eea30be8 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Tue, 24 Sep 2024 17:25:25 +0000 Subject: [PATCH] fix: Fix __repr__ caching with partial ordering --- bigframes/core/tree_properties.py | 4 ++-- bigframes/dataframe.py | 1 - bigframes/session/executor.py | 5 ----- tests/system/conftest.py | 4 ++-- tests/system/small/test_dataframe.py | 24 ++++++++++++++++++++++++ 5 files changed, 28 insertions(+), 10 deletions(-) diff --git a/bigframes/core/tree_properties.py b/bigframes/core/tree_properties.py index 4978e75e38..1b0fe0d072 100644 --- a/bigframes/core/tree_properties.py +++ b/bigframes/core/tree_properties.py @@ -44,8 +44,8 @@ def can_fast_head(node: nodes.BigFrameNode) -> bool: """Can get head fast if can push head operator down to leafs and operators preserve rows.""" if isinstance(node, nodes.LeafNode): return node.supports_fast_head - if isinstance(node, nodes.UnaryNode): - return node.row_preserving and can_fast_head(node.child) + if isinstance(node, (nodes.ProjectionNode, nodes.SelectionNode)): + return can_fast_head(node.child) return False diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 49a668f008..a59f6bf941 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -643,7 +643,6 @@ def __repr__(self) -> str: if opts.repr_mode == "deferred": return formatter.repr_query_job(self._compute_dry_run()) - self._cached() # TODO(swast): pass max_columns and get the true column count back. Maybe # get 1 more column than we have requested so that pandas can add the # ... for us? diff --git a/bigframes/session/executor.py b/bigframes/session/executor.py index d2a2e0f1b2..f89b5aefec 100644 --- a/bigframes/session/executor.py +++ b/bigframes/session/executor.py @@ -360,11 +360,6 @@ def _cache_with_cluster_cols( def _cache_with_offsets(self, array_value: bigframes.core.ArrayValue): """Executes the query and uses the resulting table to rewrite future executions.""" - - if not self.strictly_ordered: - raise ValueError( - "Caching with offsets only supported in strictly ordered mode." - ) offset_column = bigframes.core.guid.generate_guid("bigframes_offsets") w_offsets, offset_column = array_value.promote_offsets() sql = self.compiler.compile_unordered(self._get_optimized_plan(w_offsets.node)) diff --git a/tests/system/conftest.py b/tests/system/conftest.py index 5ee2dc6397..d9246eecfb 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -154,9 +154,9 @@ def session_load() -> Generator[bigframes.Session, None, None]: session.close() # close generated session at cleanup time -@pytest.fixture(scope="session", params=["ordered", "unordered"]) +@pytest.fixture(scope="session", params=["strict", "partial"]) def maybe_ordered_session(request) -> Generator[bigframes.Session, None, None]: - context = bigframes.BigQueryOptions(location="US", ordering_mode="partial") + context = bigframes.BigQueryOptions(location="US", ordering_mode=request.param) session = bigframes.Session(context=context) yield session session.close() # close generated session at cleanup type diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 0a637e983f..340df93791 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -567,6 +567,30 @@ def test_repr_w_all_rows(scalars_dfs): assert actual == expected +def test_join_repr(scalars_dfs_maybe_ordered): + scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered + + scalars_df = ( + scalars_df[["int64_col"]] + .join(scalars_df.set_index("int64_col")[["int64_too"]]) + .sort_index() + ) + scalars_pandas_df = ( + scalars_pandas_df[["int64_col"]] + .join(scalars_pandas_df.set_index("int64_col")[["int64_too"]]) + .sort_index() + ) + # Pandas join result index name seems to depend on the index values in a way that bigframes can't match exactly + scalars_pandas_df.index.name = None + + actual = repr(scalars_df) + + with display_options.pandas_repr(bigframes.options.display): + expected = repr(scalars_pandas_df) + + assert actual == expected + + def test_repr_html_w_all_rows(scalars_dfs): scalars_df, _ = scalars_dfs # get a pandas df of the expected format