diff --git a/CHANGELOG.md b/CHANGELOG.md index 63bf7fd0ec..60fe9ae5e3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,24 @@ [1]: https://pypi.org/project/bigframes/#history +## [2.27.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v2.26.0...v2.27.0) (2025-10-24) + + +### Features + +* Add __abs__ to dataframe ([#2186](https://github.com/googleapis/python-bigquery-dataframes/issues/2186)) ([c331dfe](https://github.com/googleapis/python-bigquery-dataframes/commit/c331dfed59174962fbdc8ace175dd00fcc3d5d50)) +* Add df.groupby().corr()/cov() support ([#2190](https://github.com/googleapis/python-bigquery-dataframes/issues/2190)) ([ccd7c07](https://github.com/googleapis/python-bigquery-dataframes/commit/ccd7c0774a65d09e6cf31d2b62d0bc64bd7c4248)) +* Add str accessor to index ([#2179](https://github.com/googleapis/python-bigquery-dataframes/issues/2179)) ([cd87ce0](https://github.com/googleapis/python-bigquery-dataframes/commit/cd87ce0d504747f44d1b5a55f869a2e0fca6df17)) +* Add support for `np.isnan` and `np.isfinite` ufuncs ([#2188](https://github.com/googleapis/python-bigquery-dataframes/issues/2188)) ([68723bc](https://github.com/googleapis/python-bigquery-dataframes/commit/68723bc1f08013e43a8b11752f908bf8fd6d51f5)) +* Include local data bytes in the dry run report when available ([#2185](https://github.com/googleapis/python-bigquery-dataframes/issues/2185)) ([ee2c40c](https://github.com/googleapis/python-bigquery-dataframes/commit/ee2c40c6789535e259fb6a9774831d6913d16212)) +* Support len() on Groupby objects ([#2183](https://github.com/googleapis/python-bigquery-dataframes/issues/2183)) ([4191821](https://github.com/googleapis/python-bigquery-dataframes/commit/4191821b0976281a96c8965336ef51f061b0c481)) +* Support pa.json_(pa.string()) in struct/list if available ([#2180](https://github.com/googleapis/python-bigquery-dataframes/issues/2180)) 
([5ec3cc0](https://github.com/googleapis/python-bigquery-dataframes/commit/5ec3cc0298c7a6195d5bd12a08d996e7df57fc5f)) + + +### Documentation + +* Update AI operators deprecation notice ([#2182](https://github.com/googleapis/python-bigquery-dataframes/issues/2182)) ([2c50310](https://github.com/googleapis/python-bigquery-dataframes/commit/2c503107e17c59232b14b0d7bc40c350bb087d6f)) + ## [2.26.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v2.25.0...v2.26.0) (2025-10-17) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 166841dfbd..1900b7208a 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -967,7 +967,7 @@ def _compute_dry_run( } dry_run_stats = dry_runs.get_query_stats_with_dtypes( - query_job, column_dtypes, self.index.dtypes + query_job, column_dtypes, self.index.dtypes, self.expr.node ) return dry_run_stats, query_job diff --git a/bigframes/core/compile/ibis_compiler/scalar_op_compiler.py b/bigframes/core/compile/ibis_compiler/scalar_op_compiler.py index 1197f6b9da..8a027ca296 100644 --- a/bigframes/core/compile/ibis_compiler/scalar_op_compiler.py +++ b/bigframes/core/compile/ibis_compiler/scalar_op_compiler.py @@ -26,6 +26,7 @@ from bigframes.core import agg_expressions, ordering import bigframes.core.compile.ibis_types import bigframes.core.expression as ex +from bigframes.operations import numeric_ops if TYPE_CHECKING: import bigframes.operations as ops @@ -267,3 +268,13 @@ def _convert_range_ordering_to_table_value( # Singleton compiler scalar_op_compiler = ExpressionCompiler() + + +@scalar_op_compiler.register_unary_op(numeric_ops.isnan_op) +def isnanornull(arg): + return arg.isnan() + + +@scalar_op_compiler.register_unary_op(numeric_ops.isfinite_op) +def isfinite(arg): + return arg.isinf().negate() & arg.isnan().negate() diff --git a/bigframes/core/compile/polars/operations/numeric_ops.py b/bigframes/core/compile/polars/operations/numeric_ops.py index 2572d862e3..8e44f15955 100644 --- 
a/bigframes/core/compile/polars/operations/numeric_ops.py +++ b/bigframes/core/compile/polars/operations/numeric_ops.py @@ -89,3 +89,21 @@ def sqrt_op_impl( import polars as pl return pl.when(input < 0).then(float("nan")).otherwise(input.sqrt()) + + +@polars_compiler.register_op(numeric_ops.IsNanOp) +def is_nan_op_impl( + compiler: polars_compiler.PolarsExpressionCompiler, + op: numeric_ops.IsNanOp, # type: ignore + input: pl.Expr, +) -> pl.Expr: + return input.is_nan() + + +@polars_compiler.register_op(numeric_ops.IsFiniteOp) +def is_finite_op_impl( + compiler: polars_compiler.PolarsExpressionCompiler, + op: numeric_ops.IsFiniteOp, # type: ignore + input: pl.Expr, +) -> pl.Expr: + return input.is_finite() diff --git a/bigframes/core/compile/sqlglot/expressions/generic_ops.py b/bigframes/core/compile/sqlglot/expressions/generic_ops.py index 60366b02c9..9782ef11d4 100644 --- a/bigframes/core/compile/sqlglot/expressions/generic_ops.py +++ b/bigframes/core/compile/sqlglot/expressions/generic_ops.py @@ -76,8 +76,8 @@ def _(expr: TypedExpr) -> sge.Expression: @register_unary_op(ops.invert_op) def _(expr: TypedExpr) -> sge.Expression: if expr.dtype == dtypes.BOOL_DTYPE: - return sge.Not(this=expr.expr) - return sge.BitwiseNot(this=expr.expr) + return sge.Not(this=sge.paren(expr.expr)) + return sge.BitwiseNot(this=sge.paren(expr.expr)) @register_unary_op(ops.isnull_op) diff --git a/bigframes/core/compile/sqlglot/expressions/numeric_ops.py b/bigframes/core/compile/sqlglot/expressions/numeric_ops.py index 3bbe2623ea..8ca884b900 100644 --- a/bigframes/core/compile/sqlglot/expressions/numeric_ops.py +++ b/bigframes/core/compile/sqlglot/expressions/numeric_ops.py @@ -22,6 +22,7 @@ import bigframes.core.compile.sqlglot.expressions.constants as constants from bigframes.core.compile.sqlglot.expressions.typed_expr import TypedExpr import bigframes.core.compile.sqlglot.scalar_compiler as scalar_compiler +from bigframes.operations import numeric_ops register_unary_op = 
scalar_compiler.scalar_op_compiler.register_unary_op register_binary_op = scalar_compiler.scalar_op_compiler.register_binary_op @@ -189,7 +190,7 @@ def _(expr: TypedExpr) -> sge.Expression: @register_unary_op(ops.neg_op) def _(expr: TypedExpr) -> sge.Expression: - return sge.Neg(this=expr.expr) + return sge.Neg(this=sge.paren(expr.expr)) @register_unary_op(ops.pos_op) @@ -408,6 +409,21 @@ def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: ) +@register_unary_op(numeric_ops.isnan_op) +def isnan(arg: TypedExpr) -> sge.Expression: + return sge.IsNan(this=arg.expr) + + +@register_unary_op(numeric_ops.isfinite_op) +def isfinite(arg: TypedExpr) -> sge.Expression: + return sge.Not( + this=sge.Or( + this=sge.IsInf(this=arg.expr), + right=sge.IsNan(this=arg.expr), + ), + ) + + def _coerce_bool_to_int(typed_expr: TypedExpr) -> sge.Expression: """Coerce boolean expression to integer.""" if typed_expr.dtype == dtypes.BOOL_DTYPE: diff --git a/bigframes/core/groupby/dataframe_group_by.py b/bigframes/core/groupby/dataframe_group_by.py index 40e96f6f42..3948d08a23 100644 --- a/bigframes/core/groupby/dataframe_group_by.py +++ b/bigframes/core/groupby/dataframe_group_by.py @@ -177,6 +177,9 @@ def __iter__(self) -> Iterable[Tuple[blocks.Label, df.DataFrame]]: filtered_df = df.DataFrame(filtered_block) yield group_keys, filtered_df + def __len__(self) -> int: + return len(self.agg([])) + def size(self) -> typing.Union[df.DataFrame, series.Series]: agg_block, _ = self._block.aggregate_size( by_column_ids=self._by_col_ids, @@ -275,6 +278,76 @@ def var( self._raise_on_non_numeric("var") return self._aggregate_all(agg_ops.var_op, numeric_only=True) + def corr( + self, + *, + numeric_only: bool = False, + ) -> df.DataFrame: + if not numeric_only: + self._raise_on_non_numeric("corr") + if len(self._selected_cols) > 30: + raise ValueError( + f"Cannot calculate corr on >30 columns, dataframe has {len(self._selected_cols)} selected columns." 
+ ) + + labels = self._block._get_labels_for_columns(self._selected_cols) + block = self._block + aggregations = [ + agg_expressions.BinaryAggregation( + agg_ops.CorrOp(), ex.deref(left_col), ex.deref(right_col) + ) + for left_col in self._selected_cols + for right_col in self._selected_cols + ] + # Add a positional level so that duplicate column labels stay unique + uniq_orig_columns = utils.combine_indices(labels, pd.Index(range(len(labels)))) + result_labels = utils.cross_indices(uniq_orig_columns, uniq_orig_columns) + + block, _ = block.aggregate( + by_column_ids=self._by_col_ids, + aggregations=aggregations, + column_labels=result_labels, + ) + + block = block.stack(levels=labels.nlevels + 1) + # Drop the last level of each index, which was created to guarantee uniqueness + return df.DataFrame(block).droplevel(-1, axis=0).droplevel(-1, axis=1) + + def cov( + self, + *, + numeric_only: bool = False, + ) -> df.DataFrame: + if not numeric_only: + self._raise_on_non_numeric("cov") + if len(self._selected_cols) > 30: + raise ValueError( + f"Cannot calculate cov on >30 columns, dataframe has {len(self._selected_cols)} selected columns." 
+ ) + + labels = self._block._get_labels_for_columns(self._selected_cols) + block = self._block + aggregations = [ + agg_expressions.BinaryAggregation( + agg_ops.CovOp(), ex.deref(left_col), ex.deref(right_col) + ) + for left_col in self._selected_cols + for right_col in self._selected_cols + ] + # Add a positional level so that duplicate column labels stay unique + uniq_orig_columns = utils.combine_indices(labels, pd.Index(range(len(labels)))) + result_labels = utils.cross_indices(uniq_orig_columns, uniq_orig_columns) + + block, _ = block.aggregate( + by_column_ids=self._by_col_ids, + aggregations=aggregations, + column_labels=result_labels, + ) + + block = block.stack(levels=labels.nlevels + 1) + # Drop the last level of each index, which was created to guarantee uniqueness + return df.DataFrame(block).droplevel(-1, axis=0).droplevel(-1, axis=1) + def skew( self, *, diff --git a/bigframes/core/groupby/series_group_by.py b/bigframes/core/groupby/series_group_by.py index 1f2632078d..b09b63dcfe 100644 --- a/bigframes/core/groupby/series_group_by.py +++ b/bigframes/core/groupby/series_group_by.py @@ -108,6 +108,9 @@ def __iter__(self) -> Iterable[Tuple[blocks.Label, series.Series]]: filtered_series.name = self._value_name yield group_keys, filtered_series + def __len__(self) -> int: + return len(self.agg([])) + def all(self) -> series.Series: return self._aggregate(agg_ops.all_op) @@ -275,9 +278,9 @@ def agg(self, func=None) -> typing.Union[df.DataFrame, series.Series]: if column_names: agg_block = agg_block.with_column_labels(column_names) - if len(aggregations) > 1: - return df.DataFrame(agg_block) - return series.Series(agg_block) + if len(aggregations) == 1: + return series.Series(agg_block) + return df.DataFrame(agg_block) aggregate = agg diff --git a/bigframes/core/indexes/base.py b/bigframes/core/indexes/base.py index 54d8228ff6..0e82b6dea7 100644 --- a/bigframes/core/indexes/base.py +++ b/bigframes/core/indexes/base.py @@ -43,6 +43,7 @@ if typing.TYPE_CHECKING: import bigframes.dataframe + import 
bigframes.operations.strings import bigframes.series @@ -254,6 +255,12 @@ def query_job(self) -> bigquery.QueryJob: self._query_job = query_job return self._query_job + @property + def str(self) -> bigframes.operations.strings.StringMethods: + import bigframes.operations.strings + + return bigframes.operations.strings.StringMethods(self) + def get_loc(self, key) -> typing.Union[int, slice, "bigframes.series.Series"]: """Get integer location, slice or boolean mask for requested label. @@ -317,7 +324,9 @@ def get_loc(self, key) -> typing.Union[int, slice, "bigframes.series.Series"]: result_series = bigframes.series.Series(mask_block) return result_series.astype("boolean") - def _get_monotonic_slice(self, filtered_block, offsets_id: str) -> slice: + def _get_monotonic_slice( + self, filtered_block, offsets_id: __builtins__.str + ) -> slice: """Helper method to get a slice for monotonic duplicates with an optimized query.""" # Combine min and max aggregations into a single query for efficiency min_max_aggs = [ @@ -343,7 +352,7 @@ def _get_monotonic_slice(self, filtered_block, offsets_id: str) -> slice: # Create slice (stop is exclusive) return slice(min_pos, max_pos + 1) - def __repr__(self) -> str: + def __repr__(self) -> __builtins__.str: # Protect against errors with uninitialized Series. 
See: # https://github.com/googleapis/python-bigquery-dataframes/issues/728 if not hasattr(self, "_block"): @@ -417,7 +426,7 @@ def sort_values( *, inplace: bool = False, ascending: bool = True, - na_position: str = "last", + na_position: __builtins__.str = "last", ) -> Index: if na_position not in ["first", "last"]: raise ValueError("Param na_position must be one of 'first' or 'last'") @@ -604,7 +613,7 @@ def dropna(self, how: typing.Literal["all", "any"] = "any") -> Index: result = block_ops.dropna(self._block, self._block.index_columns, how=how) return Index(result) - def drop_duplicates(self, *, keep: str = "first") -> Index: + def drop_duplicates(self, *, keep: __builtins__.str = "first") -> Index: if keep is not False: validations.enforce_ordered(self, "drop_duplicates") block = block_ops.drop_duplicates(self._block, self._block.index_columns, keep) @@ -656,6 +665,9 @@ def __contains__(self, key) -> bool: block, match_col = self._block.project_expr(match_expr_final) return cast(bool, block.get_stat(match_col, agg_ops.AnyOp())) + def _apply_unary_op(self, op: ops.UnaryOp) -> Index: + return self._apply_unary_expr(op.as_expr(ex.free_var("input"))) + def _apply_unary_expr( self, op: ex.Expression, @@ -762,9 +774,15 @@ def item(self): return self.to_series().peek(2).item() def __eq__(self, other) -> Index: # type: ignore - return self._apply_binop(other, ops.eq_op) + return self._apply_binary_op(other, ops.eq_op) - def _apply_binop(self, other, op: ops.BinaryOp) -> Index: + def _apply_binary_op( + self, + other, + op: ops.BinaryOp, + alignment: typing.Literal["outer", "left"] = "outer", + ) -> Index: + # Note: the alignment arg exists for compatibility with accessors; it is ignored because it is irrelevant for implicit joins. # TODO: Handle local objects, or objects not implicitly alignable? 
Gets ambiguous with partial ordering though if isinstance(other, (bigframes.series.Series, Index)): other = Index(other) @@ -785,12 +803,13 @@ def _apply_binop(self, other, op: ops.BinaryOp) -> Index: for lid, rid in zip(lexpr.column_ids, rexpr.column_ids) ] ) + labels = self.names if self.names == other.names else [None] * len(res_ids) return Index( blocks.Block( expr.select_columns(res_ids), index_columns=res_ids, column_labels=[], - index_labels=[None] * len(res_ids), + index_labels=labels, ) ) elif ( @@ -799,7 +818,7 @@ def _apply_binop(self, other, op: ops.BinaryOp) -> Index: block, id = self._block.project_expr( op.as_expr(self._block.index_columns[0], ex.const(other)) ) - return Index(block.select_column(id)) + return Index(block.set_index([id], index_labels=self.names)) elif isinstance(other, tuple) and len(other) == self.nlevels: block = self._block.project_exprs( [ @@ -809,7 +828,7 @@ def _apply_binop(self, other, op: ops.BinaryOp) -> Index: labels=[None] * self.nlevels, drop=True, ) - return Index(block.set_index(block.value_columns)) + return Index(block.set_index(block.value_columns, index_labels=self.names)) else: return NotImplemented diff --git a/bigframes/core/indexes/multi.py b/bigframes/core/indexes/multi.py index a611442b88..cfabd9e70d 100644 --- a/bigframes/core/indexes/multi.py +++ b/bigframes/core/indexes/multi.py @@ -60,7 +60,7 @@ def __eq__(self, other) -> Index: # type: ignore import bigframes.operations as ops import bigframes.operations.aggregations as agg_ops - eq_result = self._apply_binop(other, ops.eq_op)._block.expr + eq_result = self._apply_binary_op(other, ops.eq_op)._block.expr as_array = ops.ToArrayOp().as_expr( *( diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 8504a5bb95..c3735ca3c2 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -1306,6 +1306,11 @@ def __pos__(self) -> DataFrame: def __neg__(self) -> DataFrame: return self._apply_unary_op(ops.neg_op) + def __abs__(self) -> DataFrame: + 
return self._apply_unary_op(ops.abs_op) + + __abs__.__doc__ = abs.__doc__ + def align( self, other: typing.Union[DataFrame, bigframes.series.Series], diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index 18ecdede11..6c05b6f4a3 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -340,6 +340,12 @@ def is_struct_like(type_: ExpressionType) -> bool: ) +def is_json_arrow_type(type_: pa.DataType) -> bool: + return isinstance(type_, db_dtypes.JSONArrowType) or ( + hasattr(pa, "JsonType") and isinstance(type_, pa.JsonType) + ) + + def is_json_like(type_: ExpressionType) -> bool: return type_ == JSON_DTYPE or type_ == STRING_DTYPE # Including JSON string @@ -510,6 +516,10 @@ def arrow_dtype_to_bigframes_dtype( if arrow_dtype == pa.null(): return DEFAULT_DTYPE + # Allow both db_dtypes.JSONArrowType() and pa.json_(pa.string()) + if is_json_arrow_type(arrow_dtype): + return JSON_DTYPE + # No other types matched. raise TypeError( f"Unexpected Arrow data type {arrow_dtype}. {constants.FEEDBACK_LINK}" diff --git a/bigframes/operations/numeric_ops.py b/bigframes/operations/numeric_ops.py index afdc924c0b..83e2078c88 100644 --- a/bigframes/operations/numeric_ops.py +++ b/bigframes/operations/numeric_ops.py @@ -348,3 +348,19 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT name="unsafe_pow_op", type_signature=op_typing.BINARY_REAL_NUMERIC ) unsafe_pow_op = UnsafePowOp() + +IsNanOp = base_ops.create_unary_op( + name="isnan", + type_signature=op_typing.FixedOutputType( + dtypes.is_numeric, dtypes.BOOL_DTYPE, "numeric" + ), +) +isnan_op = IsNanOp() + +IsFiniteOp = base_ops.create_unary_op( + name="isfinite", + type_signature=op_typing.FixedOutputType( + dtypes.is_numeric, dtypes.BOOL_DTYPE, "numeric" + ), +) +isfinite_op = IsFiniteOp() diff --git a/bigframes/operations/numpy_op_maps.py b/bigframes/operations/numpy_op_maps.py index 7f3decdfa0..791e2eb890 100644 --- a/bigframes/operations/numpy_op_maps.py +++ 
b/bigframes/operations/numpy_op_maps.py @@ -40,6 +40,8 @@ np.ceil: numeric_ops.ceil_op, np.log1p: numeric_ops.log1p_op, np.expm1: numeric_ops.expm1_op, + np.isnan: numeric_ops.isnan_op, + np.isfinite: numeric_ops.isfinite_op, } diff --git a/bigframes/operations/strings.py b/bigframes/operations/strings.py index 3288be591c..d84a66789d 100644 --- a/bigframes/operations/strings.py +++ b/bigframes/operations/strings.py @@ -15,12 +15,13 @@ from __future__ import annotations import re -from typing import Literal, Optional, Union +from typing import Generic, Hashable, Literal, Optional, TypeVar, Union import bigframes_vendored.constants as constants import bigframes_vendored.pandas.core.strings.accessor as vendorstr from bigframes.core import log_adapter +import bigframes.core.indexes.base as indices import bigframes.dataframe as df import bigframes.operations as ops from bigframes.operations._op_converters import convert_index, convert_slice @@ -34,15 +35,17 @@ re.DOTALL: "s", } +T = TypeVar("T", series.Series, indices.Index) + @log_adapter.class_logger -class StringMethods(vendorstr.StringMethods): +class StringMethods(vendorstr.StringMethods, Generic[T]): __doc__ = vendorstr.StringMethods.__doc__ - def __init__(self, data: series.Series): - self._data = data + def __init__(self, data: T): + self._data: T = data - def __getitem__(self, key: Union[int, slice]) -> series.Series: + def __getitem__(self, key: Union[int, slice]) -> T: if isinstance(key, int): return self._data._apply_unary_op(convert_index(key)) elif isinstance(key, slice): @@ -55,18 +58,18 @@ def find( sub: str, start: Optional[int] = None, end: Optional[int] = None, - ) -> series.Series: + ) -> T: return self._data._apply_unary_op( ops.StrFindOp(substr=sub, start=start, end=end) ) - def len(self) -> series.Series: + def len(self) -> T: return self._data._apply_unary_op(ops.len_op) - def lower(self) -> series.Series: + def lower(self) -> T: return self._data._apply_unary_op(ops.lower_op) - def reverse(self) 
-> series.Series: + def reverse(self) -> T: """Reverse strings in the Series. **Examples:** @@ -91,103 +94,103 @@ def slice( self, start: Optional[int] = None, stop: Optional[int] = None, - ) -> series.Series: + ) -> T: return self._data._apply_unary_op(ops.StrSliceOp(start=start, end=stop)) - def strip(self, to_strip: Optional[str] = None) -> series.Series: + def strip(self, to_strip: Optional[str] = None) -> T: return self._data._apply_unary_op( ops.StrStripOp(to_strip=" \n\t" if to_strip is None else to_strip) ) - def upper(self) -> series.Series: + def upper(self) -> T: return self._data._apply_unary_op(ops.upper_op) - def isnumeric(self) -> series.Series: + def isnumeric(self) -> T: return self._data._apply_unary_op(ops.isnumeric_op) def isalpha( self, - ) -> series.Series: + ) -> T: return self._data._apply_unary_op(ops.isalpha_op) def isdigit( self, - ) -> series.Series: + ) -> T: return self._data._apply_unary_op(ops.isdigit_op) def isdecimal( self, - ) -> series.Series: + ) -> T: return self._data._apply_unary_op(ops.isdecimal_op) def isalnum( self, - ) -> series.Series: + ) -> T: return self._data._apply_unary_op(ops.isalnum_op) def isspace( self, - ) -> series.Series: + ) -> T: return self._data._apply_unary_op(ops.isspace_op) def islower( self, - ) -> series.Series: + ) -> T: return self._data._apply_unary_op(ops.islower_op) def isupper( self, - ) -> series.Series: + ) -> T: return self._data._apply_unary_op(ops.isupper_op) - def rstrip(self, to_strip: Optional[str] = None) -> series.Series: + def rstrip(self, to_strip: Optional[str] = None) -> T: return self._data._apply_unary_op( ops.StrRstripOp(to_strip=" \n\t" if to_strip is None else to_strip) ) - def lstrip(self, to_strip: Optional[str] = None) -> series.Series: + def lstrip(self, to_strip: Optional[str] = None) -> T: return self._data._apply_unary_op( ops.StrLstripOp(to_strip=" \n\t" if to_strip is None else to_strip) ) - def repeat(self, repeats: int) -> series.Series: + def repeat(self, 
repeats: int) -> T: return self._data._apply_unary_op(ops.StrRepeatOp(repeats=repeats)) - def capitalize(self) -> series.Series: + def capitalize(self) -> T: return self._data._apply_unary_op(ops.capitalize_op) - def match(self, pat, case=True, flags=0) -> series.Series: + def match(self, pat, case=True, flags=0) -> T: # \A anchors start of entire string rather than start of any line in multiline mode adj_pat = rf"\A{pat}" return self.contains(pat=adj_pat, case=case, flags=flags) - def fullmatch(self, pat, case=True, flags=0) -> series.Series: + def fullmatch(self, pat, case=True, flags=0) -> T: # \A anchors start of entire string rather than start of any line in multiline mode # \z likewise anchors to the end of the entire multiline string adj_pat = rf"\A{pat}\z" return self.contains(pat=adj_pat, case=case, flags=flags) - def get(self, i: int) -> series.Series: + def get(self, i: int) -> T: return self._data._apply_unary_op(ops.StrGetOp(i=i)) - def pad(self, width, side="left", fillchar=" ") -> series.Series: + def pad(self, width, side="left", fillchar=" ") -> T: return self._data._apply_unary_op( ops.StrPadOp(length=width, fillchar=fillchar, side=side) ) - def ljust(self, width, fillchar=" ") -> series.Series: + def ljust(self, width, fillchar=" ") -> T: return self._data._apply_unary_op( ops.StrPadOp(length=width, fillchar=fillchar, side="right") ) - def rjust(self, width, fillchar=" ") -> series.Series: + def rjust(self, width, fillchar=" ") -> T: return self._data._apply_unary_op( ops.StrPadOp(length=width, fillchar=fillchar, side="left") ) def contains( self, pat, case: bool = True, flags: int = 0, *, regex: bool = True - ) -> series.Series: + ) -> T: if not case: return self.contains(pat=pat, flags=flags | re.IGNORECASE, regex=True) if regex: @@ -206,23 +209,19 @@ def extract(self, pat: str, flags: int = 0) -> df.DataFrame: if compiled.groups == 0: raise ValueError("No capture groups in 'pat'") - results: list[str] = [] - block = self._data._block + 
results: dict[Hashable, series.Series] = {} for i in range(compiled.groups): labels = [ label for label, groupn in compiled.groupindex.items() if i + 1 == groupn ] - label = labels[0] if labels else str(i) - block, id = block.apply_unary_op( - self._data._value_column, + label = labels[0] if labels else i + result = self._data._apply_unary_op( ops.StrExtractOp(pat=pat, n=i + 1), - result_label=label, ) - results.append(id) - block = block.select_columns(results) - return df.DataFrame(block) + results[label] = series.Series(result) + return df.DataFrame(results) def replace( self, @@ -232,7 +231,7 @@ def replace( case: Optional[bool] = None, flags: int = 0, regex: bool = False, - ) -> series.Series: + ) -> T: if isinstance(pat, re.Pattern): assert isinstance(pat.pattern, str) pat_str = pat.pattern @@ -259,7 +258,7 @@ def replace( def startswith( self, pat: Union[str, tuple[str, ...]], - ) -> series.Series: + ) -> T: if not isinstance(pat, tuple): pat = (pat,) return self._data._apply_unary_op(ops.StartsWithOp(pat=pat)) @@ -267,7 +266,7 @@ def startswith( def endswith( self, pat: Union[str, tuple[str, ...]], - ) -> series.Series: + ) -> T: if not isinstance(pat, tuple): pat = (pat,) return self._data._apply_unary_op(ops.EndsWithOp(pat=pat)) @@ -276,7 +275,7 @@ def split( self, pat: str = " ", regex: Union[bool, None] = None, - ) -> series.Series: + ) -> T: if regex is True or (regex is None and len(pat) > 1): raise NotImplementedError( "Regular expressions aren't currently supported. 
Please set " @@ -284,28 +283,28 @@ def split( ) return self._data._apply_unary_op(ops.StringSplitOp(pat=pat)) - def zfill(self, width: int) -> series.Series: + def zfill(self, width: int) -> T: return self._data._apply_unary_op(ops.ZfillOp(width=width)) - def center(self, width: int, fillchar: str = " ") -> series.Series: + def center(self, width: int, fillchar: str = " ") -> T: return self._data._apply_unary_op( ops.StrPadOp(length=width, fillchar=fillchar, side="both") ) def cat( self, - others: Union[str, series.Series], + others: Union[str, indices.Index, series.Series], *, join: Literal["outer", "left"] = "left", - ) -> series.Series: + ) -> T: return self._data._apply_binary_op(others, ops.strconcat_op, alignment=join) - def join(self, sep: str) -> series.Series: + def join(self, sep: str) -> T: return self._data._apply_unary_op( ops.ArrayReduceOp(aggregation=agg_ops.StringAggOp(sep=sep)) ) - def to_blob(self, connection: Optional[str] = None) -> series.Series: + def to_blob(self, connection: Optional[str] = None) -> T: """Create a BigFrames Blob series from a series of URIs. .. 
note:: diff --git a/bigframes/series.py b/bigframes/series.py index f08fc6cc14..ad1f091803 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -74,12 +74,13 @@ import bigframes.operations.datetimes as dt import bigframes.operations.lists as lists import bigframes.operations.plotting as plotting -import bigframes.operations.strings as strings import bigframes.operations.structs as structs import bigframes.session if typing.TYPE_CHECKING: import bigframes.geopandas.geoseries + import bigframes.operations.strings as strings + LevelType = typing.Union[str, int] LevelsType = typing.Union[LevelType, typing.Sequence[LevelType]] @@ -1363,6 +1364,8 @@ def update(self, other: Union[Series, Sequence, Mapping]) -> None: def __abs__(self) -> Series: return self.abs() + __abs__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.abs) + def abs(self) -> Series: return self._apply_unary_op(ops.abs_op) @@ -2649,6 +2652,8 @@ def _cached(self, *, force: bool = True, session_aware: bool = True) -> Series: # confusing type checker by overriding str @property def str(self) -> strings.StringMethods: + import bigframes.operations.strings as strings + return strings.StringMethods(self) @property diff --git a/bigframes/session/dry_runs.py b/bigframes/session/dry_runs.py index 51e8e72c9a..bd54bb65d7 100644 --- a/bigframes/session/dry_runs.py +++ b/bigframes/session/dry_runs.py @@ -20,6 +20,7 @@ import pandas from bigframes import dtypes +from bigframes.core import bigframe_node, nodes def get_table_stats(table: bigquery.Table) -> pandas.Series: @@ -86,13 +87,26 @@ def get_query_stats_with_dtypes( query_job: bigquery.QueryJob, column_dtypes: Dict[str, dtypes.Dtype], index_dtypes: Sequence[dtypes.Dtype], + expr_root: bigframe_node.BigFrameNode | None = None, ) -> pandas.Series: + """ + Returns important stats from the query job as a Pandas Series. The dtypes information is added too. 
+ + Args: + expr_root (Optional): + The root of the expression tree that may contain local data, whose size is added to the + total bytes count if available. + + """ index = ["columnCount", "columnDtypes", "indexLevel", "indexDtypes"] values = [len(column_dtypes), column_dtypes, len(index_dtypes), index_dtypes] s = pandas.Series(values, index=index) - return pandas.concat([s, get_query_stats(query_job)]) + result = pandas.concat([s, get_query_stats(query_job)]) + if expr_root is not None: + result["totalBytesProcessed"] += get_local_bytes(expr_root) + return result def get_query_stats( @@ -145,4 +159,24 @@ def get_query_stats( else None ) - return pandas.Series(values, index=index) + result = pandas.Series(values, index=index) + if result["totalBytesProcessed"] is None: + result["totalBytesProcessed"] = 0 + else: + result["totalBytesProcessed"] = int(result["totalBytesProcessed"]) + + return result + + +def get_local_bytes(root: bigframe_node.BigFrameNode) -> int: + def get_total_bytes( + root: bigframe_node.BigFrameNode, child_results: tuple[int, ...] + ) -> int: + child_bytes = sum(child_results) + + if isinstance(root, nodes.ReadLocalNode): + return child_bytes + root.local_data_source.data.get_total_buffer_size() + + return child_bytes + + return root.reduce_up(get_total_bytes) diff --git a/bigframes/version.py b/bigframes/version.py index 6fe84df0ab..4e319dd41d 100644 --- a/bigframes/version.py +++ b/bigframes/version.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "2.26.0" +__version__ = "2.27.0" # {x-release-please-start-date} -__release_date__ = "2025-10-17" +__release_date__ = "2025-10-24" # {x-release-please-end} diff --git a/notebooks/experimental/ai_operators.ipynb b/notebooks/experimental/ai_operators.ipynb index 8aaa3f4b7c..9878929cd2 100644 --- a/notebooks/experimental/ai_operators.ipynb +++ b/notebooks/experimental/ai_operators.ipynb @@ -29,7 +29,7 @@ "id": "rWJnGj2ViouP" }, "source": [ - "All AI operators except for `ai.forecast` have been deprecated.\n", + "All AI operators except for `ai.forecast` have moved to the `bigframes.bigquery.ai` module.\n", "\n", "The tutorial notebook for AI functions is located at https://github.com/googleapis/python-bigquery-dataframes/blob/main/notebooks/generative_ai/ai_functions.ipynb\n", "\n", diff --git a/noxfile.py b/noxfile.py index f9c20c999c..8334fcb0e1 100644 --- a/noxfile.py +++ b/noxfile.py @@ -193,7 +193,7 @@ def format(session): @nox.session(python=DEFAULT_PYTHON_VERSION) def lint_setup_py(session): """Verify that setup.py is valid (including RST check).""" - session.install("docutils", "pygments") + session.install("docutils", "pygments", "setuptools") session.run("python", "setup.py", "check", "--restructuredtext", "--strict") session.install("twine", "wheel") diff --git a/scripts/publish_api_coverage.py b/scripts/publish_api_coverage.py index 8f305bcc0f..43f7df4dd6 100644 --- a/scripts/publish_api_coverage.py +++ b/scripts/publish_api_coverage.py @@ -30,6 +30,7 @@ import bigframes.core.groupby import bigframes.core.window import bigframes.operations.datetimes +import bigframes.operations.strings import bigframes.pandas as bpd REPO_ROOT = pathlib.Path(__file__).parent.parent diff --git a/tests/system/small/engines/test_generic_ops.py b/tests/system/small/engines/test_generic_ops.py index f209b95496..5641f91a9a 100644 --- a/tests/system/small/engines/test_generic_ops.py +++ b/tests/system/small/engines/test_generic_ops.py @@ -454,7 +454,7 @@ def 
test_engines_isin_op(scalars_array_value: array_value.ArrayValue, engine): assert_equivalence_execution(arr.node, REFERENCE_ENGINE, engine) -@pytest.mark.parametrize("engine", ["polars", "bq"], indirect=True) +@pytest.mark.parametrize("engine", ["polars", "bq", "bq-sqlglot"], indirect=True) def test_engines_isin_op_nested_filter( scalars_array_value: array_value.ArrayValue, engine ): diff --git a/tests/system/small/operations/test_strings.py b/tests/system/small/operations/test_strings.py index 6cd6309cbb..657fc231d1 100644 --- a/tests/system/small/operations/test_strings.py +++ b/tests/system/small/operations/test_strings.py @@ -78,8 +78,6 @@ def test_str_extract(scalars_dfs, pat): bf_result = bf_series.str.extract(pat).to_pandas() pd_result = scalars_pandas_df[col_name].str.extract(pat) - # Pandas produces int col labels, while bq df only supports str labels at present - pd_result = pd_result.set_axis(pd_result.columns.astype(str), axis=1) pd.testing.assert_frame_equal( pd_result, bf_result, diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 34bb5a4fb3..79f8efd00f 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -2460,6 +2460,16 @@ def test_df_neg(scalars_dfs): assert_pandas_df_equal(pd_result, bf_result) +def test_df__abs__(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = ( + abs(scalars_df[["int64_col", "numeric_col", "float64_col"]]) + ).to_pandas() + pd_result = abs(scalars_pandas_df[["int64_col", "numeric_col", "float64_col"]]) + + assert_pandas_df_equal(pd_result, bf_result) + + def test_df_invert(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs columns = ["int64_col", "bool_col"] diff --git a/tests/system/small/test_groupby.py b/tests/system/small/test_groupby.py index 553a12a14a..4f187dcccc 100644 --- a/tests/system/small/test_groupby.py +++ b/tests/system/small/test_groupby.py @@ -61,6 +61,15 @@ def 
test_dataframe_groupby_head(scalars_df_index, scalars_pandas_df_index): pd.testing.assert_frame_equal(pd_result, bf_result, check_dtype=False) +def test_dataframe_groupby_len(scalars_df_index, scalars_pandas_df_index): + col_names = ["int64_too", "float64_col", "int64_col", "bool_col", "string_col"] + + bf_result = len(scalars_df_index[col_names].groupby("bool_col")) + pd_result = len(scalars_pandas_df_index[col_names].groupby("bool_col")) + + assert bf_result == pd_result + + def test_dataframe_groupby_median(scalars_df_index, scalars_pandas_df_index): col_names = ["int64_too", "float64_col", "int64_col", "bool_col", "string_col"] bf_result = ( @@ -161,6 +170,26 @@ def test_dataframe_groupby_aggregate( pd.testing.assert_frame_equal(pd_result, bf_result_computed, check_dtype=False) +def test_dataframe_groupby_corr(scalars_df_index, scalars_pandas_df_index): + col_names = ["int64_too", "float64_col", "int64_col", "bool_col"] + bf_result = scalars_df_index[col_names].groupby("bool_col").corr().to_pandas() + pd_result = scalars_pandas_df_index[col_names].groupby("bool_col").corr() + + pd.testing.assert_frame_equal( + pd_result, bf_result, check_dtype=False, check_index_type=False + ) + + +def test_dataframe_groupby_cov(scalars_df_index, scalars_pandas_df_index): + col_names = ["int64_too", "float64_col", "int64_col", "bool_col"] + bf_result = scalars_df_index[col_names].groupby("bool_col").cov().to_pandas() + pd_result = scalars_pandas_df_index[col_names].groupby("bool_col").cov() + + pd.testing.assert_frame_equal( + pd_result, bf_result, check_dtype=False, check_index_type=False + ) + + @pytest.mark.parametrize( ("ordered"), [ @@ -668,6 +697,13 @@ def test_dataframe_groupby_last( # ============== +def test_series_groupby_len(scalars_df_index, scalars_pandas_df_index): + bf_result = len(scalars_df_index.groupby("bool_col")["int64_col"]) + pd_result = len(scalars_pandas_df_index.groupby("bool_col")["int64_col"]) + + assert bf_result == pd_result + + 
@pytest.mark.parametrize( ("agg"), [ diff --git a/tests/system/small/test_index.py b/tests/system/small/test_index.py index 3fe479af6e..0ec1fb6143 100644 --- a/tests/system/small/test_index.py +++ b/tests/system/small/test_index.py @@ -685,3 +685,39 @@ def test_index_eq_aligned_index(scalars_df_index, scalars_pandas_df_index): scalars_pandas_df_index.int64_col.abs() ) assert bf_result == pd.Index(pd_result) + + +def test_index_str_accessor_unary(scalars_df_index, scalars_pandas_df_index): + bf_index = scalars_df_index.set_index("string_col").index + pd_index = scalars_pandas_df_index.set_index("string_col").index + + bf_result = bf_index.str.pad(30, side="both", fillchar="~").to_pandas() + pd_result = pd_index.str.pad(30, side="both", fillchar="~") + + pd.testing.assert_index_equal(bf_result, pd_result) + + +def test_index_str_accessor_binary(scalars_df_index, scalars_pandas_df_index): + if pd.__version__.startswith("1."): + pytest.skip("doesn't work in pandas 1.x.") + bf_index = scalars_df_index.set_index("string_col").index + pd_index = scalars_pandas_df_index.set_index("string_col").index + + bf_result = bf_index.str.cat(bf_index.str[:4]).to_pandas() + pd_result = pd_index.str.cat(pd_index.str[:4]) + + pd.testing.assert_index_equal(bf_result, pd_result) + + +@pytest.mark.parametrize( + ("pat"), + [(r"(ell)(lo)"), (r"(?Ph..)"), (r"(?Pe.*o)([g-l]+)")], +) +def test_index_str_extract(scalars_df_index, scalars_pandas_df_index, pat): + bf_index = scalars_df_index.set_index("string_col").index + pd_index = scalars_pandas_df_index.set_index("string_col").index + + bf_result = bf_index.str.extract(pat).to_pandas() + pd_result = pd_index.str.extract(pat) + + pd.testing.assert_frame_equal(pd_result, bf_result, check_index_type=False) diff --git a/tests/system/small/test_numpy.py b/tests/system/small/test_numpy.py index 37a707b9d0..490f927114 100644 --- a/tests/system/small/test_numpy.py +++ b/tests/system/small/test_numpy.py @@ -37,6 +37,8 @@ ("log10",), ("sqrt",), 
("abs",), + ("isnan",), + ("isfinite",), ], ) def test_series_ufuncs(floats_pd, floats_bf, opname): diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index df538329ce..5ace3f54d8 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -353,6 +353,41 @@ def test_series_construct_w_json_dtype(json_type): assert s[5] == '{"a":{"b":[1,2,3],"c":true}}' +def test_series_construct_w_nested_json_dtype(): + list_data = [ + [{"key": "1"}], + [{"key": None}], + [{"key": '["1","3","5"]'}], + [{"key": '{"a":1,"b":["x","y"],"c":{"x":[],"z":false}}'}], + ] + pa_array = pa.array(list_data, type=pa.list_(pa.struct([("key", pa.string())]))) + + db_json_arrow_dtype = db_dtypes.JSONArrowType() + s = bigframes.pandas.Series( + pd.arrays.ArrowExtensionArray(pa_array), # type: ignore + dtype=pd.ArrowDtype( + pa.list_(pa.struct([("key", db_json_arrow_dtype)])), + ), + ) + + assert s[0][0]["key"] == "1" + assert not s[1][0]["key"] + assert s[2][0]["key"] == '["1","3","5"]' + assert s[3][0]["key"] == '{"a":1,"b":["x","y"],"c":{"x":[],"z":false}}' + + # Test with pyarrow.json_(pa.string()) if available. 
+ if hasattr(pa, "JsonType"): + pyarrow_json_dtype = pa.json_(pa.string()) + s2 = bigframes.pandas.Series( + pd.arrays.ArrowExtensionArray(pa_array), # type: ignore + dtype=pd.ArrowDtype( + pa.list_(pa.struct([("key", pyarrow_json_dtype)])), + ), + ) + + pd.testing.assert_series_equal(s.to_pandas(), s2.to_pandas()) + + def test_series_keys(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs bf_result = scalars_df["int64_col"].keys().to_pandas() diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index 001e02c2fa..d3e646dc92 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -2173,6 +2173,22 @@ def test_read_gbq_query_dry_run(scalars_table_id, session): _assert_query_dry_run_stats_are_valid(result) +def test_block_dry_run_includes_local_data(session): + df1 = bigframes.dataframe.DataFrame({"col_1": [1, 2, 3]}, session=session) + df2 = bigframes.dataframe.DataFrame({"col_2": [1, 2, 3]}, session=session) + + result = df1.merge(df2, how="cross").to_pandas(dry_run=True) + + assert isinstance(result, pd.Series) + _assert_query_dry_run_stats_are_valid(result) + assert result["totalBytesProcessed"] > 0 + assert ( + df1.to_pandas(dry_run=True)["totalBytesProcessed"] + + df2.to_pandas(dry_run=True)["totalBytesProcessed"] + == result["totalBytesProcessed"] + ) + + def _assert_query_dry_run_stats_are_valid(result: pd.Series): expected_index = pd.Index( [ diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_generic_ops/test_invert/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_generic_ops/test_invert/out.sql index b5a5b92b52..bf005efb05 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_generic_ops/test_invert/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_generic_ops/test_invert/out.sql @@ -7,9 +7,15 @@ WITH `bfcte_0` AS ( ), `bfcte_1` AS ( SELECT *, - ~`bfcol_2` AS `bfcol_6`, - ~`bfcol_1` AS `bfcol_7`, - NOT 
`bfcol_0` AS `bfcol_8` + ~( + `bfcol_2` + ) AS `bfcol_6`, + ~( + `bfcol_1` + ) AS `bfcol_7`, + NOT ( + `bfcol_0` + ) AS `bfcol_8` FROM `bfcte_0` ) SELECT diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_add_string/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_add_string/out.sql new file mode 100644 index 0000000000..de5129a6a3 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_add_string/out.sql @@ -0,0 +1,13 @@ +WITH `bfcte_0` AS ( + SELECT + `string_col` AS `bfcol_0` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + CONCAT(`bfcol_0`, 'a') AS `bfcol_1` + FROM `bfcte_0` +) +SELECT + `bfcol_1` AS `string_col` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_add_timedelta/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_add_timedelta/out.sql new file mode 100644 index 0000000000..a47531999b --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_add_timedelta/out.sql @@ -0,0 +1,60 @@ +WITH `bfcte_0` AS ( + SELECT + `date_col` AS `bfcol_0`, + `rowindex` AS `bfcol_1`, + `timestamp_col` AS `bfcol_2` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + `bfcol_1` AS `bfcol_6`, + `bfcol_2` AS `bfcol_7`, + `bfcol_0` AS `bfcol_8`, + TIMESTAMP_ADD(CAST(`bfcol_0` AS DATETIME), INTERVAL 86400000000 MICROSECOND) AS `bfcol_9` + FROM `bfcte_0` +), `bfcte_2` AS ( + SELECT + *, + `bfcol_6` AS `bfcol_14`, + `bfcol_7` AS `bfcol_15`, + `bfcol_8` AS `bfcol_16`, + `bfcol_9` AS `bfcol_17`, + TIMESTAMP_ADD(`bfcol_7`, INTERVAL 86400000000 MICROSECOND) AS `bfcol_18` + FROM `bfcte_1` +), `bfcte_3` AS ( + SELECT + *, + `bfcol_14` AS `bfcol_24`, + `bfcol_15` AS `bfcol_25`, + `bfcol_16` AS `bfcol_26`, + `bfcol_17` AS `bfcol_27`, + `bfcol_18` 
AS `bfcol_28`, + TIMESTAMP_ADD(CAST(`bfcol_16` AS DATETIME), INTERVAL 86400000000 MICROSECOND) AS `bfcol_29` + FROM `bfcte_2` +), `bfcte_4` AS ( + SELECT + *, + `bfcol_24` AS `bfcol_36`, + `bfcol_25` AS `bfcol_37`, + `bfcol_26` AS `bfcol_38`, + `bfcol_27` AS `bfcol_39`, + `bfcol_28` AS `bfcol_40`, + `bfcol_29` AS `bfcol_41`, + TIMESTAMP_ADD(`bfcol_25`, INTERVAL 86400000000 MICROSECOND) AS `bfcol_42` + FROM `bfcte_3` +), `bfcte_5` AS ( + SELECT + *, + 172800000000 AS `bfcol_50` + FROM `bfcte_4` +) +SELECT + `bfcol_36` AS `rowindex`, + `bfcol_37` AS `timestamp_col`, + `bfcol_38` AS `date_col`, + `bfcol_39` AS `date_add_timedelta`, + `bfcol_40` AS `timestamp_add_timedelta`, + `bfcol_41` AS `timedelta_add_date`, + `bfcol_42` AS `timedelta_add_timestamp`, + `bfcol_50` AS `timedelta_add_timedelta` +FROM `bfcte_5` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_mod_numeric/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_mod_numeric/out.sql index 7913b43aa6..64f456a72d 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_mod_numeric/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_mod_numeric/out.sql @@ -38,23 +38,43 @@ WITH `bfcte_0` AS ( `bfcol_8` AS `bfcol_16`, `bfcol_9` AS `bfcol_17`, CASE - WHEN -`bfcol_7` = CAST(0 AS INT64) + WHEN -( + `bfcol_7` + ) = CAST(0 AS INT64) THEN CAST(0 AS INT64) * `bfcol_7` - WHEN -`bfcol_7` < CAST(0 AS INT64) + WHEN -( + `bfcol_7` + ) < CAST(0 AS INT64) AND ( - MOD(`bfcol_7`, -`bfcol_7`) + MOD(`bfcol_7`, -( + `bfcol_7` + )) ) > CAST(0 AS INT64) - THEN -`bfcol_7` + ( - MOD(`bfcol_7`, -`bfcol_7`) + THEN -( + `bfcol_7` + ) + ( + MOD(`bfcol_7`, -( + `bfcol_7` + )) ) - WHEN -`bfcol_7` > CAST(0 AS INT64) + WHEN -( + `bfcol_7` + ) > CAST(0 AS INT64) AND ( - MOD(`bfcol_7`, -`bfcol_7`) + MOD(`bfcol_7`, -( + `bfcol_7` + )) ) < CAST(0 AS INT64) - THEN -`bfcol_7` + ( - 
MOD(`bfcol_7`, -`bfcol_7`) + THEN -( + `bfcol_7` + ) + ( + MOD(`bfcol_7`, -( + `bfcol_7` + )) ) - ELSE MOD(`bfcol_7`, -`bfcol_7`) + ELSE MOD(`bfcol_7`, -( + `bfcol_7` + )) END AS `bfcol_18` FROM `bfcte_1` ), `bfcte_3` AS ( @@ -152,23 +172,43 @@ WITH `bfcte_0` AS ( `bfcol_56` AS `bfcol_72`, `bfcol_57` AS `bfcol_73`, CASE - WHEN CAST(-`bfcol_52` AS BIGNUMERIC) = CAST(0 AS INT64) + WHEN CAST(-( + `bfcol_52` + ) AS BIGNUMERIC) = CAST(0 AS INT64) THEN CAST('NaN' AS FLOAT64) * CAST(`bfcol_52` AS BIGNUMERIC) - WHEN CAST(-`bfcol_52` AS BIGNUMERIC) < CAST(0 AS INT64) + WHEN CAST(-( + `bfcol_52` + ) AS BIGNUMERIC) < CAST(0 AS INT64) AND ( - MOD(CAST(`bfcol_52` AS BIGNUMERIC), CAST(-`bfcol_52` AS BIGNUMERIC)) + MOD(CAST(`bfcol_52` AS BIGNUMERIC), CAST(-( + `bfcol_52` + ) AS BIGNUMERIC)) ) > CAST(0 AS INT64) - THEN CAST(-`bfcol_52` AS BIGNUMERIC) + ( - MOD(CAST(`bfcol_52` AS BIGNUMERIC), CAST(-`bfcol_52` AS BIGNUMERIC)) + THEN CAST(-( + `bfcol_52` + ) AS BIGNUMERIC) + ( + MOD(CAST(`bfcol_52` AS BIGNUMERIC), CAST(-( + `bfcol_52` + ) AS BIGNUMERIC)) ) - WHEN CAST(-`bfcol_52` AS BIGNUMERIC) > CAST(0 AS INT64) + WHEN CAST(-( + `bfcol_52` + ) AS BIGNUMERIC) > CAST(0 AS INT64) AND ( - MOD(CAST(`bfcol_52` AS BIGNUMERIC), CAST(-`bfcol_52` AS BIGNUMERIC)) + MOD(CAST(`bfcol_52` AS BIGNUMERIC), CAST(-( + `bfcol_52` + ) AS BIGNUMERIC)) ) < CAST(0 AS INT64) - THEN CAST(-`bfcol_52` AS BIGNUMERIC) + ( - MOD(CAST(`bfcol_52` AS BIGNUMERIC), CAST(-`bfcol_52` AS BIGNUMERIC)) + THEN CAST(-( + `bfcol_52` + ) AS BIGNUMERIC) + ( + MOD(CAST(`bfcol_52` AS BIGNUMERIC), CAST(-( + `bfcol_52` + ) AS BIGNUMERIC)) ) - ELSE MOD(CAST(`bfcol_52` AS BIGNUMERIC), CAST(-`bfcol_52` AS BIGNUMERIC)) + ELSE MOD(CAST(`bfcol_52` AS BIGNUMERIC), CAST(-( + `bfcol_52` + ) AS BIGNUMERIC)) END AS `bfcol_74` FROM `bfcte_5` ), `bfcte_7` AS ( diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_mul_timedelta/out.sql 
b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_mul_timedelta/out.sql new file mode 100644 index 0000000000..f8752d0a60 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_mul_timedelta/out.sql @@ -0,0 +1,43 @@ +WITH `bfcte_0` AS ( + SELECT + `int64_col` AS `bfcol_0`, + `rowindex` AS `bfcol_1`, + `timestamp_col` AS `bfcol_2`, + `duration_col` AS `bfcol_3` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + `bfcol_1` AS `bfcol_8`, + `bfcol_2` AS `bfcol_9`, + `bfcol_0` AS `bfcol_10`, + `bfcol_3` AS `bfcol_11` + FROM `bfcte_0` +), `bfcte_2` AS ( + SELECT + *, + `bfcol_8` AS `bfcol_16`, + `bfcol_9` AS `bfcol_17`, + `bfcol_10` AS `bfcol_18`, + `bfcol_11` AS `bfcol_19`, + CAST(FLOOR(`bfcol_11` * `bfcol_10`) AS INT64) AS `bfcol_20` + FROM `bfcte_1` +), `bfcte_3` AS ( + SELECT + *, + `bfcol_16` AS `bfcol_26`, + `bfcol_17` AS `bfcol_27`, + `bfcol_18` AS `bfcol_28`, + `bfcol_19` AS `bfcol_29`, + `bfcol_20` AS `bfcol_30`, + CAST(FLOOR(`bfcol_18` * `bfcol_19`) AS INT64) AS `bfcol_31` + FROM `bfcte_2` +) +SELECT + `bfcol_26` AS `rowindex`, + `bfcol_27` AS `timestamp_col`, + `bfcol_28` AS `int64_col`, + `bfcol_29` AS `duration_col`, + `bfcol_30` AS `timedelta_mul_numeric`, + `bfcol_31` AS `numeric_mul_timedelta` +FROM `bfcte_3` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_neg/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_neg/out.sql index 46c58f766d..39bdd6da7f 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_neg/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_neg/out.sql @@ -5,7 +5,9 @@ WITH `bfcte_0` AS ( ), `bfcte_1` AS ( SELECT *, - -`bfcol_0` AS `bfcol_1` + -( + `bfcol_0` + ) AS `bfcol_1` FROM `bfcte_0` ) SELECT diff --git 
a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_sub_timedelta/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_sub_timedelta/out.sql new file mode 100644 index 0000000000..2d615fcca6 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_sub_timedelta/out.sql @@ -0,0 +1,82 @@ +WITH `bfcte_0` AS ( + SELECT + `date_col` AS `bfcol_0`, + `rowindex` AS `bfcol_1`, + `timestamp_col` AS `bfcol_2`, + `duration_col` AS `bfcol_3` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + `bfcol_1` AS `bfcol_8`, + `bfcol_2` AS `bfcol_9`, + `bfcol_0` AS `bfcol_10`, + `bfcol_3` AS `bfcol_11` + FROM `bfcte_0` +), `bfcte_2` AS ( + SELECT + *, + `bfcol_8` AS `bfcol_16`, + `bfcol_9` AS `bfcol_17`, + `bfcol_11` AS `bfcol_18`, + `bfcol_10` AS `bfcol_19`, + TIMESTAMP_SUB(CAST(`bfcol_10` AS DATETIME), INTERVAL `bfcol_11` MICROSECOND) AS `bfcol_20` + FROM `bfcte_1` +), `bfcte_3` AS ( + SELECT + *, + `bfcol_16` AS `bfcol_26`, + `bfcol_17` AS `bfcol_27`, + `bfcol_18` AS `bfcol_28`, + `bfcol_19` AS `bfcol_29`, + `bfcol_20` AS `bfcol_30`, + TIMESTAMP_SUB(`bfcol_17`, INTERVAL `bfcol_18` MICROSECOND) AS `bfcol_31` + FROM `bfcte_2` +), `bfcte_4` AS ( + SELECT + *, + `bfcol_26` AS `bfcol_38`, + `bfcol_27` AS `bfcol_39`, + `bfcol_28` AS `bfcol_40`, + `bfcol_29` AS `bfcol_41`, + `bfcol_30` AS `bfcol_42`, + `bfcol_31` AS `bfcol_43`, + TIMESTAMP_DIFF(CAST(`bfcol_29` AS DATETIME), CAST(`bfcol_29` AS DATETIME), MICROSECOND) AS `bfcol_44` + FROM `bfcte_3` +), `bfcte_5` AS ( + SELECT + *, + `bfcol_38` AS `bfcol_52`, + `bfcol_39` AS `bfcol_53`, + `bfcol_40` AS `bfcol_54`, + `bfcol_41` AS `bfcol_55`, + `bfcol_42` AS `bfcol_56`, + `bfcol_43` AS `bfcol_57`, + `bfcol_44` AS `bfcol_58`, + TIMESTAMP_DIFF(`bfcol_39`, `bfcol_39`, MICROSECOND) AS `bfcol_59` + FROM `bfcte_4` +), `bfcte_6` AS ( + SELECT + *, + `bfcol_52` AS `bfcol_68`, + `bfcol_53` AS `bfcol_69`, + `bfcol_54` AS 
`bfcol_70`, + `bfcol_55` AS `bfcol_71`, + `bfcol_56` AS `bfcol_72`, + `bfcol_57` AS `bfcol_73`, + `bfcol_58` AS `bfcol_74`, + `bfcol_59` AS `bfcol_75`, + `bfcol_54` - `bfcol_54` AS `bfcol_76` + FROM `bfcte_5` +) +SELECT + `bfcol_68` AS `rowindex`, + `bfcol_69` AS `timestamp_col`, + `bfcol_70` AS `duration_col`, + `bfcol_71` AS `date_col`, + `bfcol_72` AS `date_sub_timedelta`, + `bfcol_73` AS `timestamp_sub_timedelta`, + `bfcol_74` AS `timestamp_sub_date`, + `bfcol_75` AS `date_sub_timestamp`, + `bfcol_76` AS `timedelta_sub_timedelta` +FROM `bfcte_6` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/test_numeric_ops.py b/tests/unit/core/compile/sqlglot/expressions/test_numeric_ops.py index 59726da73b..fe9a53a558 100644 --- a/tests/unit/core/compile/sqlglot/expressions/test_numeric_ops.py +++ b/tests/unit/core/compile/sqlglot/expressions/test_numeric_ops.py @@ -16,6 +16,7 @@ import pytest from bigframes import operations as ops +import bigframes.core.expression as ex import bigframes.pandas as bpd from bigframes.testing import utils @@ -218,6 +219,34 @@ def test_add_numeric(scalar_types_df: bpd.DataFrame, snapshot): snapshot.assert_match(bf_df.sql, "out.sql") +def test_add_string(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["string_col"]] + sql = utils._apply_binary_op(bf_df, ops.add_op, "string_col", ex.const("a")) + + snapshot.assert_match(sql, "out.sql") + + +def test_add_timedelta(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["timestamp_col", "date_col"]] + timedelta = pd.Timedelta(1, unit="d") + + bf_df["date_add_timedelta"] = bf_df["date_col"] + timedelta + bf_df["timestamp_add_timedelta"] = bf_df["timestamp_col"] + timedelta + bf_df["timedelta_add_date"] = timedelta + bf_df["date_col"] + bf_df["timedelta_add_timestamp"] = timedelta + bf_df["timestamp_col"] + bf_df["timedelta_add_timedelta"] = timedelta + timedelta + + snapshot.assert_match(bf_df.sql, "out.sql") + + +def 
test_add_unsupported_raises(scalar_types_df: bpd.DataFrame): + with pytest.raises(TypeError): + utils._apply_binary_op(scalar_types_df, ops.add_op, "timestamp_col", "date_col") + + with pytest.raises(TypeError): + utils._apply_binary_op(scalar_types_df, ops.add_op, "int64_col", "string_col") + + def test_div_numeric(scalar_types_df: bpd.DataFrame, snapshot): bf_df = scalar_types_df[["int64_col", "bool_col", "float64_col"]] @@ -279,6 +308,16 @@ def test_mul_numeric(scalar_types_df: bpd.DataFrame, snapshot): snapshot.assert_match(bf_df.sql, "out.sql") +def test_mul_timedelta(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["timestamp_col", "int64_col", "duration_col"]] + bf_df["duration_col"] = bpd.to_timedelta(bf_df["duration_col"], unit="us") + + bf_df["timedelta_mul_numeric"] = bf_df["duration_col"] * bf_df["int64_col"] + bf_df["numeric_mul_timedelta"] = bf_df["int64_col"] * bf_df["duration_col"] + + snapshot.assert_match(bf_df.sql, "out.sql") + + def test_mod_numeric(scalar_types_df: bpd.DataFrame, snapshot): bf_df = scalar_types_df[["int64_col", "float64_col"]] @@ -305,3 +344,24 @@ def test_sub_numeric(scalar_types_df: bpd.DataFrame, snapshot): bf_df["bool_add_int"] = bf_df["bool_col"] - bf_df["int64_col"] snapshot.assert_match(bf_df.sql, "out.sql") + + +def test_sub_timedelta(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["timestamp_col", "duration_col", "date_col"]] + bf_df["duration_col"] = bpd.to_timedelta(bf_df["duration_col"], unit="us") + + bf_df["date_sub_timedelta"] = bf_df["date_col"] - bf_df["duration_col"] + bf_df["timestamp_sub_timedelta"] = bf_df["timestamp_col"] - bf_df["duration_col"] + bf_df["timestamp_sub_date"] = bf_df["date_col"] - bf_df["date_col"] + bf_df["date_sub_timestamp"] = bf_df["timestamp_col"] - bf_df["timestamp_col"] + bf_df["timedelta_sub_timedelta"] = bf_df["duration_col"] - bf_df["duration_col"] + + snapshot.assert_match(bf_df.sql, "out.sql") + + +def 
test_sub_unsupported_raises(scalar_types_df: bpd.DataFrame): + with pytest.raises(TypeError): + utils._apply_binary_op(scalar_types_df, ops.sub_op, "string_col", "string_col") + + with pytest.raises(TypeError): + utils._apply_binary_op(scalar_types_df, ops.sub_op, "int64_col", "string_col") diff --git a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py index ba6310507d..01852beb9c 100644 --- a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py +++ b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py @@ -1516,6 +1516,68 @@ def aggregate(self, func, **kwargs): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def corr( + self, + *, + numeric_only: bool = False, + ): + """ + Compute pairwise correlation of columns, excluding NA/null values. + + **Examples:** + + + >>> df = bpd.DataFrame({'A': [1, 2, 3], + ... 'B': [400, 500, 600], + ... 'C': [0.8, 0.4, 0.9]}) + >>> df.corr(numeric_only=True) + A B C + A 1.0 1.0 0.188982 + B 1.0 1.0 0.188982 + C 0.188982 0.188982 1.0 + + [3 rows x 3 columns] + + Args: + numeric_only(bool, default False): + Include only float, int, boolean, decimal data. + + Returns: + bigframes.pandas.DataFrame: Correlation matrix. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def cov( + self, + *, + numeric_only: bool = False, + ): + """ + Compute pairwise covariance of columns, excluding NA/null values. + + **Examples:** + + + >>> df = bpd.DataFrame({'A': [1, 2, 3], + ... 'B': [400, 500, 600], + ... 'C': [0.8, 0.4, 0.9]}) + >>> df.cov(numeric_only=True) + A B C + A 1.0 100.0 0.05 + B 100.0 10000.0 5.0 + C 0.05 5.0 0.07 + + [3 rows x 3 columns] + + Args: + numeric_only(bool, default False): + Include only float, int, boolean, decimal data. + + Returns: + bigframes.pandas.DataFrame: The covariance matrix of the series of the DataFrame. 
+ """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def nunique(self): """ Return DataFrame with counts of unique elements in each position. diff --git a/third_party/bigframes_vendored/pandas/core/indexes/base.py b/third_party/bigframes_vendored/pandas/core/indexes/base.py index e120dabc66..d21056a8cf 100644 --- a/third_party/bigframes_vendored/pandas/core/indexes/base.py +++ b/third_party/bigframes_vendored/pandas/core/indexes/base.py @@ -366,6 +366,36 @@ def T(self) -> Index: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + @property + def str(self): + """ + Vectorized string functions for Series and Index. + + NAs stay NA unless handled otherwise by a particular method. Patterned + after Python’s string methods, with some inspiration from R’s stringr package. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> s = bpd.Series(["A_Str_Series"]) + >>> s + 0 A_Str_Series + dtype: string + + >>> s.str.lower() + 0 a_str_series + dtype: string + + >>> s.str.replace("_", "") + 0 AStrSeries + dtype: string + + Returns: + bigframes.operations.strings.StringMethods: + An accessor containing string methods. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def copy( self, name=None, diff --git a/third_party/bigframes_vendored/version.py b/third_party/bigframes_vendored/version.py index 6fe84df0ab..4e319dd41d 100644 --- a/third_party/bigframes_vendored/version.py +++ b/third_party/bigframes_vendored/version.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "2.26.0" +__version__ = "2.27.0" # {x-release-please-start-date} -__release_date__ = "2025-10-17" +__release_date__ = "2025-10-24" # {x-release-please-end}