From a8cb2c3f63f302b0742c1da2d93106c6a5565799 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Thu, 26 Jun 2025 01:27:30 +0000 Subject: [PATCH] fix: Fix bug selecting column repeatedly --- bigframes/core/array_value.py | 25 ++++++++++++++++++++----- bigframes/core/blocks.py | 5 ++++- tests/system/small/test_dataframe.py | 9 +++++++++ 3 files changed, 33 insertions(+), 6 deletions(-) diff --git a/bigframes/core/array_value.py b/bigframes/core/array_value.py index 4b05781cb7..b47637cb59 100644 --- a/bigframes/core/array_value.py +++ b/bigframes/core/array_value.py @@ -330,12 +330,27 @@ def create_constant( return self.project_to_id(ex.const(value, dtype)) - def select_columns(self, column_ids: typing.Sequence[str]) -> ArrayValue: + def select_columns( + self, column_ids: typing.Sequence[str], allow_renames: bool = False + ) -> ArrayValue: # This basically just drops and reorders columns - logically a no-op except as a final step - selections = ( - bigframes.core.nodes.AliasedRef.identity(ids.ColumnId(col_id)) - for col_id in column_ids - ) + selections = [] + seen = set() + + for id in column_ids: + if id not in seen: + ref = nodes.AliasedRef.identity(ids.ColumnId(id)) + elif allow_renames: + ref = nodes.AliasedRef( + ex.deref(id), ids.ColumnId(bigframes.core.guid.generate_guid()) + ) + else: + raise ValueError( + "Must set allow_renames=True to select columns repeatedly" + ) + selections.append(ref) + seen.add(id) + return ArrayValue( nodes.SelectionNode( child=self.node, diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 675e8c8b7a..1426459912 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -1210,7 +1210,10 @@ def select_column(self, id: str) -> Block: return self.select_columns([id]) def select_columns(self, ids: typing.Sequence[str]) -> Block: - expr = self._expr.select_columns([*self.index_columns, *ids]) + # Allow renames as may end up selecting same columns multiple times + expr = self._expr.select_columns( + [*self.index_columns, *ids], allow_renames=True + ) col_labels = self._get_labels_for_columns(ids) return Block(expr, self.index_columns, col_labels, self.index.names) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index b037c6f371..d5446efcd0 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -3408,6 +3408,15 @@ def test__dir__with_rename(scalars_dfs): assert "drop" in results +def test_loc_select_columns_w_repeats(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index[["int64_col", "int64_col", "int64_too"]].to_pandas() + pd_result = scalars_pandas_df_index[["int64_col", "int64_col", "int64_too"]] + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + @pytest.mark.parametrize( ("start", "stop", "step"), [