8000 feat: Add `__contains__` to Index, Series, DataFrame by TrevorBergeron · Pull Request #1899 · googleapis/python-bigquery-dataframes · GitHub
[go: up one dir, main page]

Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 25 additions & 1 deletion bigframes/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,9 @@

from __future__ import annotations

import functools
import typing
from typing import Hashable, Literal, Optional, overload, Sequence, Union
from typing import cast, Hashable, Literal, Optional, overload, Sequence, Union

import bigframes_vendored.constants as constants
import bigframes_vendored.pandas.core.indexes.base as vendored_pandas_index
Expand Down Expand Up @@ -529,6 +530,29 @@ def isin(self, values) -> Index:
)
).fillna(value=False)

def __contains__(self, key) -> bool:
    """Return whether ``key`` matches at least one entry of this index.

    A non-tuple key, or any key on a single-level index, is treated as a
    one-part key. On a multi-level index a tuple key is compared part by
    part against the leading index levels, so a tuple shorter than
    ``nlevels`` matches on those leading levels only.

    Args:
        key: Hashable label, or tuple of labels for a multi-level index.

    Returns:
        bool: True if some row's index value(s) match ``key``. False if the
        index has no levels, if ``key`` has more parts than the index has
        levels, or if any key part is not compatible with the dtype of its
        level.

    Raises:
        TypeError: If ``key`` is unhashable.
    """
    hash(key)  # to throw for unhashable values
    if self.nlevels == 0:
        return False

    if (not isinstance(key, tuple)) or (self.nlevels == 1):
        key = (key,)

    # zip() below stops at the shortest input, so surplus key parts would be
    # silently ignored and e.g. (a, b, "extra") would match wherever (a, b)
    # does. A key deeper than the index can never be a member.
    if len(key) > self.nlevels:
        return False

    # NOTE(review): an empty tuple key on a multi-level index leaves
    # match_exprs empty, and functools.reduce then raises TypeError --
    # confirm the desired result for that input.
    match_exprs = []
    for key_part, index_col, dtype in zip(
        key, self._block.index_columns, self._block.index.dtypes
    ):
        key_type = bigframes.dtypes.is_compatible(key_part, dtype)
        if key_type is None:
            # The key part cannot be represented in this level's dtype.
            return False
        key_expr = ex.const(key_part, key_type)
        # presumably eq_null_match_op treats two nulls as equal -- verify
        match_expr = ops.eq_null_match_op.as_expr(ex.deref(index_col), key_expr)
        match_exprs.append(match_expr)

    # A row matches only when every compared level matches.
    match_expr_final = functools.reduce(ops.and_op.as_expr, match_exprs)
    block, match_col = self._block.project_expr(match_expr_final)
    return cast(bool, block.get_stat(match_col, agg_ops.AnyOp()))

def _apply_unary_expr(
self,
op: ex.Expression,
Expand Down
3 changes: 3 additions & 0 deletions bigframes/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -374,6 +374,9 @@ def __len__(self):
def __iter__(self):
    """Return an iterator over ``self.columns``."""
    column_labels = self.columns
    return iter(column_labels)

def __contains__(self, key) -> bool:
    """Report whether ``key`` is a member of ``self.columns``."""
    column_labels = self.columns
    return key in column_labels

def astype(
self,
dtype: Union[
Expand Down
3 changes: 3 additions & 0 deletions bigframes/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,6 +257,9 @@ def __iter__(self) -> typing.Iterator:
map(lambda x: x.squeeze(axis=1), self._block.to_pandas_batches())
)

def __contains__(self, key) -> bool:
    """Report whether ``key`` is a member of ``self.index``."""
    index_labels = self.index
    return key in index_labels

def copy(self) -> Series:
    """Return a new ``Series`` constructed from this series' block."""
    duplicate = Series(self._block)
    return duplicate

Expand Down
16 changes: 16 additions & 0 deletions tests/system/small/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -4451,6 +4451,22 @@ def test_df___array__(scalars_df_index, scalars_pandas_df_index):
)


@pytest.mark.parametrize(
    "key",
    [
        "hello",
        2,
        "int64_col",
        None,
    ],
)
def test_df_contains(scalars_df_index, scalars_pandas_df_index, key):
    """``key in DataFrame`` should give the same answer as pandas."""
    actual = key in scalars_df_index
    expected = key in scalars_pandas_df_index

    assert actual == expected


def test_df_getattr_attribute_error_when_pandas_has(scalars_df_index):
# swapaxes is implemented in pandas but not in bigframes
with pytest.raises(AttributeError):
Expand Down
12 changes: 12 additions & 0 deletions tests/system/small/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -398,6 +398,18 @@ def test_index_drop_duplicates(scalars_df_index, scalars_pandas_df_index, keep):
)


@pytest.mark.parametrize(
    "key",
    ["hello", 2, 123123321, 2.0, False, (2,), pd.NA],
)
def test_index_contains(scalars_df_index, scalars_pandas_df_index, key):
    """``key in Index`` on a single-level index should agree with pandas."""
    index_col = "int64_col"
    bf_index = scalars_df_index.set_index(index_col).index
    pd_index = scalars_pandas_df_index.set_index(index_col).index

    actual = key in bf_index
    expected = key in pd_index

    assert actual == expected


def test_index_isin_list(scalars_df_index, scalars_pandas_df_index):
col_name = "int64_col"
bf_series = (
Expand Down
23 changes: 23 additions & 0 deletions tests/system/small/test_multiindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -1388,3 +1388,26 @@ def test_column_multi_index_w_na_stack(scalars_df_index, scalars_pandas_df_index
# Pandas produces pd.NA, where bq dataframes produces NaN
pd_result["c"] = pd_result["c"].replace(pandas.NA, np.nan)
pandas.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False)


@pytest.mark.parametrize(
    "key",
    [
        "hello",
        2,
        123123321,
        2.0,
        pandas.NA,
        False,
        (2,),
        (2, False),
        (2.0, False),
        (2, True),
    ],
)
def test_multi_index_contains(scalars_df_index, scalars_pandas_df_index, key):
    """``key in Index`` on a two-level index should agree with pandas."""
    index_cols = ["int64_col", "bool_col"]
    bf_index = scalars_df_index.set_index(index_cols).index
    pd_index = scalars_pandas_df_index.set_index(index_cols).index

    actual = key in bf_index
    expected = key in pd_index

    assert actual == expected
4 changes: 4 additions & 0 deletions tests/system/small/test_null_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -396,3 +396,7 @@ def test_null_index_index_property(scalars_df_null_index):
def test_null_index_transpose(scalars_df_null_index):
    """Accessing ``.T`` on the null-index frame should raise NullIndexError."""
    expected_error = bigframes.exceptions.NullIndexError
    with pytest.raises(expected_error):
        _ = scalars_df_null_index.T


def test_null_index_contains(scalars_df_null_index):
    """``3 in`` the null-index fixture should evaluate to False."""
    found = 3 in scalars_df_null_index
    assert not found
16 changes: 16 additions & 0 deletions tests/system/small/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -424,6 +424,22 @@ def test_series_get_column_default(scalars_dfs):
assert result == "default_val"


@pytest.mark.parametrize(
    "key",
    [
        "hello",
        2,
        "int64_col",
        None,
    ],
)
def test_series_contains(scalars_df_index, scalars_pandas_df_index, key):
    """``key in Series`` should give the same answer as pandas."""
    column = "int64_col"
    actual = key in scalars_df_index[column]
    expected = key in scalars_pandas_df_index[column]

    assert actual == expected


def test_series_equals_identical(scalars_df_index, scalars_pandas_df_index):
bf_result = scalars_df_index.int64_col.equals(scalars_df_index.int64_col)
pd_result = scalars_pandas_df_index.int64_col.equals(
Expand Down
0