From 168ebb803416adcd1d14dcf961d45d0c697c9b36 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Fri, 22 Mar 2024 23:23:07 +0000 Subject: [PATCH 1/4] fix: Product operation produces float result for all input types --- bigframes/core/compile/aggregate_compiler.py | 2 +- bigframes/operations/aggregations.py | 4 ++-- tests/system/small/test_groupby.py | 3 +-- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/bigframes/core/compile/aggregate_compiler.py b/bigframes/core/compile/aggregate_compiler.py index 9c1db0f162..ae21243506 100644 --- a/bigframes/core/compile/aggregate_compiler.py +++ b/bigframes/core/compile/aggregate_compiler.py @@ -190,7 +190,7 @@ def _( .else_(magnitude * pow(-1, negative_count_parity)) .end() ) - return float_result.cast(column.type()) # type: ignore + return float_result @compile_unary_agg.register diff --git a/bigframes/operations/aggregations.py b/bigframes/operations/aggregations.py index 9a270f1ce7..296d5c93b0 100644 --- a/bigframes/operations/aggregations.py +++ b/bigframes/operations/aggregations.py @@ -140,9 +140,9 @@ class ProductOp(UnaryAggregateOp): def output_type(self, *input_types: dtypes.ExpressionType): if pd.api.types.is_bool_dtype(input_types[0]): - return dtypes.INT_DTYPE + return dtypes.FLOAT_DTYPE else: - return input_types[0] + return dtypes.FLOAT_DTYPE @dataclasses.dataclass(frozen=True) diff --git a/tests/system/small/test_groupby.py b/tests/system/small/test_groupby.py index e7ecbedfc2..ba79ba1ab1 100644 --- a/tests/system/small/test_groupby.py +++ b/tests/system/small/test_groupby.py @@ -228,8 +228,7 @@ def test_dataframe_groupby_multi_sum( (lambda x: x.cumsum(numeric_only=True)), (lambda x: x.cummax(numeric_only=True)), (lambda x: x.cummin(numeric_only=True)), - # pandas 2.2 uses floating point for cumulative product even for - # integer inputs. + # Pre-pandas 2.2 doesn't always produce float. 
(lambda x: x.cumprod().astype("Float64")), (lambda x: x.shift(periods=2)), ], From e1c76a2516d251fc4b4910ef715611fb70a3aa6f Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Fri, 22 Mar 2024 23:24:00 +0000 Subject: [PATCH 2/4] simplify output_type def --- bigframes/operations/aggregations.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/bigframes/operations/aggregations.py b/bigframes/operations/aggregations.py index 296d5c93b0..76aa2a6112 100644 --- a/bigframes/operations/aggregations.py +++ b/bigframes/operations/aggregations.py @@ -139,10 +139,7 @@ class ProductOp(UnaryAggregateOp): name: ClassVar[str] = "product" def output_type(self, *input_types: dtypes.ExpressionType): - if pd.api.types.is_bool_dtype(input_types[0]): - return dtypes.FLOAT_DTYPE - else: - return dtypes.FLOAT_DTYPE + return dtypes.FLOAT_DTYPE @dataclasses.dataclass(frozen=True) From 243fe08468ed789319c539ee6cdb664f12cbe69c Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Sat, 23 Mar 2024 00:38:55 +0000 Subject: [PATCH 3/4] amend test_groupby_prod to expect float --- tests/system/small/test_series.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 8847753e88..258fb1cfd8 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -1481,7 +1481,7 @@ def test_groupby_prod(scalars_dfs): bf_series = scalars_df[col_name].groupby(scalars_df["int64_col"]).prod() pd_series = ( scalars_pandas_df[col_name].groupby(scalars_pandas_df["int64_col"]).prod() - ) + ).astype(pd.Float64Dtype()) # TODO(swast): Update groupby to use index based on group by key(s). 
bf_result = bf_series.to_pandas() assert_series_equal( From 9d30b05b9fc1cb0858bd37c813adfb319909a05d Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Mon, 25 Mar 2024 17:03:20 +0000 Subject: [PATCH 4/4] update df.cumprod doc example --- third_party/bigframes_vendored/pandas/core/frame.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index d70d3827e7..73a8e05f30 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -2865,6 +2865,7 @@ def cov(self, *, numeric_only) -> DataFrame: Returns: DataFrame: The covariance matrix of the series of the DataFrame. """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def update( self, other, join: str = "left", overwrite: bool = True, filter_func=None @@ -4415,10 +4416,10 @@ def cumprod(self) -> DataFrame: [3 rows x 2 columns] >>> df.cumprod() - A B - 0 3 1 - 1 3 2 - 2 6 6 + A B + 0 3.0 1.0 + 1 3.0 2.0 + 2 6.0 6.0 [3 rows x 2 columns]