From b83c4fbc7ea9d646afbc684ae938540719d4ddd0 Mon Sep 17 00:00:00 2001 From: Duc Le Tu Date: Mon, 29 Jan 2024 17:36:29 +0000 Subject: [PATCH 1/4] fix merge df same source after filter --- bigframes/core/compile/single_column.py | 4 ++ tests/system/conftest.py | 7 +++ tests/system/small/test_pandas_etc.py | 71 +++++++++++++++++++++++++ 3 files changed, 82 insertions(+) create mode 100644 tests/system/small/test_pandas_etc.py diff --git a/bigframes/core/compile/single_column.py b/bigframes/core/compile/single_column.py index d26e71d1b4..895ac8b90f 100644 --- a/bigframes/core/compile/single_column.py +++ b/bigframes/core/compile/single_column.py @@ -64,6 +64,8 @@ def join_by_column_ordered( .equals(right._get_ibis_column(rcol).name("index")) for lcol, rcol in join.conditions ) + and not left._predicates + and not right._predicates ): return bigframes.core.compile.row_identity.join_by_row_identity_ordered( left, right, join_def=join @@ -165,6 +167,8 @@ def join_by_column_unordered( .equals(right._get_ibis_column(rcol).name("index")) for lcol, rcol in join.conditions ) + and not left._predicates + and not right._predicates ): return bigframes.core.compile.row_identity.join_by_row_identity_unordered( left, right, join_def=join diff --git a/tests/system/conftest.py b/tests/system/conftest.py index 0ad4280497..97057d6f47 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -280,6 +280,13 @@ def scalars_table_id(test_data_tables) -> str: return test_data_tables["scalars"] +@pytest.fixture(scope="session") +def baseball_schedules_df(session: bigframes.Session) -> bigframes.dataframe.DataFrame: + """Public BQ table""" + df = session.read_gbq("bigquery-public-data.baseball.schedules") + return df + + @pytest.fixture(scope="session") def hockey_table_id(test_data_tables) -> str: return test_data_tables["hockey_players"] diff --git a/tests/system/small/test_pandas_etc.py b/tests/system/small/test_pandas_etc.py new file mode 100644 index 0000000000..b68fd67c6c --- /dev/null +++ b/tests/system/small/test_pandas_etc.py @@ -0,0 +1,71 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pandas as pd +import pytest + +import bigframes.pandas as bpd +from tests.system.utils import assert_pandas_df_equal + + +@pytest.mark.parametrize( + ("merge_how",), + [ + ("inner",), + ("outer",), + ("left",), + ("right",), + ], +) +def test_merge_after_filter(baseball_schedules_df, merge_how): + on = ["awayTeamName"] + left_columns = [ + "gameId", + "year", + "homeTeamName", + "awayTeamName", + "duration_minutes", + ] + right_columns = [ + "gameId", + "year", + "homeTeamName", + "awayTeamName", + "duration_minutes", + ] + + left = baseball_schedules_df[left_columns] + left = left[left["homeTeamName"] == "Rays"] + # Offset the rows somewhat so that outer join can have an effect. + right = baseball_schedules_df[right_columns] + right = right[right["homeTeamName"] == "White Sox"] + + df = left.merge(right, on=on, how=merge_how) + bf_result = df.to_pandas() + + left_pandas = baseball_schedules_df.to_pandas()[left_columns] + left_pandas = left_pandas[left_pandas["homeTeamName"] == "Rays"] + + right_pandas = baseball_schedules_df.to_pandas()[right_columns] + right_pandas = right_pandas[right_pandas["homeTeamName"] == "White Sox"] + + pd_result = pd.merge( + left_pandas, + right_pandas, + merge_how, + on, + sort=True, + ) + + assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) From 8fc2b6a6901cb7457aafa51460d3f6fdd05a84fe Mon Sep 17 00:00:00 2001 From: Duc Le Tu Date: Mon, 19 Feb 2024 15:12:51 +0700 Subject: [PATCH 2/4] Update test_pandas_etc.py --- tests/system/small/test_pandas_etc.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/system/small/test_pandas_etc.py b/tests/system/small/test_pandas_etc.py index b68fd67c6c..737baad688 100644 --- a/tests/system/small/test_pandas_etc.py +++ b/tests/system/small/test_pandas_etc.py @@ -15,7 +15,6 @@ import pandas as pd import pytest -import bigframes.pandas as bpd from tests.system.utils import assert_pandas_df_equal From 62ff7798d29130a721cb80386383eb086cd22d40 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Thu, 7 Mar 2024 16:09:40 +0000 Subject: [PATCH 3/4] default row identity joins to false --- bigframes/core/__init__.py | 2 +- bigframes/core/compile/row_identity.py | 10 +++++++--- bigframes/core/compile/single_column.py | 8 ++------ bigframes/core/nodes.py | 2 +- .../test_issue355_merge_after_filter.py} | 2 +- 5 files changed, 12 insertions(+), 12 deletions(-) rename tests/system/small/{test_pandas_etc.py => regression/test_issue355_merge_after_filter.py} (98%) diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index 4dc2e4d7af..9032993452 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -349,7 +349,7 @@ def join( self, other: ArrayValue, join_def: join_def.JoinDefinition, - allow_row_identity_join: bool = True, + allow_row_identity_join: bool = False, ): return ArrayValue( nodes.JoinNode( diff --git a/bigframes/core/compile/row_identity.py b/bigframes/core/compile/row_identity.py index f46e2f9463..a376b59b06 100644 --- a/bigframes/core/compile/row_identity.py +++ b/bigframes/core/compile/row_identity.py @@ -28,7 +28,7 @@ import bigframes.core.joins as joining import bigframes.core.ordering as orderings -SUPPORTED_ROW_IDENTITY_HOW = {"outer", "left", "inner"} +SUPPORTED_ROW_IDENTITY_HOW = {"outer", "left", "right", "inner"} def join_by_row_identity_unordered( @@ -39,7 +39,8 @@ def join_by_row_identity_unordered( """Compute join when we are joining by row identity not a specific column.""" if join_def.type not in SUPPORTED_ROW_IDENTITY_HOW: raise NotImplementedError( - f"Only how='outer','left','inner' currently supported. {constants.FEEDBACK_LINK}" + f"Only how={repr(SUPPORTED_ROW_IDENTITY_HOW)} currently supported. " + + constants.FEEDBACK_LINK ) if not left._table.equals(right._table): @@ -98,7 +99,8 @@ def join_by_row_identity_ordered( """Compute join when we are joining by row identity not a specific column.""" if join_def.type not in SUPPORTED_ROW_IDENTITY_HOW: raise NotImplementedError( - f"Only how='outer','left','inner' currently supported. {constants.FEEDBACK_LINK}" + f"Only how={repr(SUPPORTED_ROW_IDENTITY_HOW)} currently supported. " + + constants.FEEDBACK_LINK ) if not left._table.equals(right._table): @@ -225,6 +227,8 @@ def _join_predicates( return (joined_predicates,) if join_type == "left": return tuple(left_predicates) + if join_type == "right": + return tuple(right_predicates) if join_type == "inner": _, right_relative_predicates = _get_relative_predicates( left_predicates, right_predicates diff --git a/bigframes/core/compile/single_column.py b/bigframes/core/compile/single_column.py index 895ac8b90f..7beebfcb66 100644 --- a/bigframes/core/compile/single_column.py +++ b/bigframes/core/compile/single_column.py @@ -33,7 +33,7 @@ def join_by_column_ordered( left: compiled.OrderedIR, right: compiled.OrderedIR, join: join_defs.JoinDefinition, - allow_row_identity_join: bool = True, + allow_row_identity_join: bool = False, ) -> compiled.OrderedIR: """Join two expressions by column equality. @@ -64,8 +64,6 @@ def join_by_column_ordered( .equals(right._get_ibis_column(rcol).name("index")) for lcol, rcol in join.conditions ) - and not left._predicates - and not right._predicates ): return bigframes.core.compile.row_identity.join_by_row_identity_ordered( left, right, join_def=join @@ -136,7 +134,7 @@ def join_by_column_unordered( left: compiled.UnorderedIR, right: compiled.UnorderedIR, join: join_defs.JoinDefinition, - allow_row_identity_join: bool = True, + allow_row_identity_join: bool = False, ) -> compiled.UnorderedIR: """Join two expressions by column equality. @@ -167,8 +165,6 @@ def join_by_column_unordered( .equals(right._get_ibis_column(rcol).name("index")) for lcol, rcol in join.conditions ) - and not left._predicates - and not right._predicates ): return bigframes.core.compile.row_identity.join_by_row_identity_unordered( left, right, join_def=join diff --git a/bigframes/core/nodes.py b/bigframes/core/nodes.py index f637177a94..1cd3277cbc 100644 --- a/bigframes/core/nodes.py +++ b/bigframes/core/nodes.py @@ -115,7 +115,7 @@ class JoinNode(BigFrameNode): left_child: BigFrameNode right_child: BigFrameNode join: JoinDefinition - allow_row_identity_join: bool = True + allow_row_identity_join: bool = False @property def row_preserving(self) -> bool: diff --git a/tests/system/small/test_pandas_etc.py b/tests/system/small/regression/test_issue355_merge_after_filter.py similarity index 98% rename from tests/system/small/test_pandas_etc.py rename to tests/system/small/regression/test_issue355_merge_after_filter.py index 737baad688..24ee01cb7f 100644 --- a/tests/system/small/test_pandas_etc.py +++ b/tests/system/small/regression/test_issue355_merge_after_filter.py @@ -1,4 +1,4 @@ -# Copyright 2023 Google LLC +# Copyright 2024 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From c1810eea3cdae396b8bc716689a2f0f57496786b Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Thu, 7 Mar 2024 16:12:22 +0000 Subject: [PATCH 4/4] revert row identity join changes --- bigframes/core/compile/row_identity.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/bigframes/core/compile/row_identity.py b/bigframes/core/compile/row_identity.py index a376b59b06..f46e2f9463 100644 --- a/bigframes/core/compile/row_identity.py +++ b/bigframes/core/compile/row_identity.py @@ -28,7 +28,7 @@ import bigframes.core.joins as joining import bigframes.core.ordering as orderings -SUPPORTED_ROW_IDENTITY_HOW = {"outer", "left", "right", "inner"} +SUPPORTED_ROW_IDENTITY_HOW = {"outer", "left", "inner"} def join_by_row_identity_unordered( @@ -39,8 +39,7 @@ def join_by_row_identity_unordered( """Compute join when we are joining by row identity not a specific column.""" if join_def.type not in SUPPORTED_ROW_IDENTITY_HOW: raise NotImplementedError( - f"Only how={repr(SUPPORTED_ROW_IDENTITY_HOW)} currently supported. " - + constants.FEEDBACK_LINK + f"Only how='outer','left','inner' currently supported. {constants.FEEDBACK_LINK}" ) if not left._table.equals(right._table): @@ -99,8 +98,7 @@ def join_by_row_identity_ordered( """Compute join when we are joining by row identity not a specific column.""" if join_def.type not in SUPPORTED_ROW_IDENTITY_HOW: raise NotImplementedError( - f"Only how={repr(SUPPORTED_ROW_IDENTITY_HOW)} currently supported. " - + constants.FEEDBACK_LINK + f"Only how='outer','left','inner' currently supported. {constants.FEEDBACK_LINK}" ) if not left._table.equals(right._table): @@ -227,8 +225,6 @@ def _join_predicates( return (joined_predicates,) if join_type == "left": return tuple(left_predicates) - if join_type == "right": - return tuple(right_predicates) if join_type == "inner": _, right_relative_predicates = _get_relative_predicates( left_predicates, right_predicates