From 6ee9dd3f8f3c0cfbd7b1d2a5a782b60984ee10a9 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Mon, 24 Feb 2025 20:02:45 +0000 Subject: [PATCH 1/2] [WIP] Support date series diff. Need to update rewrites --- bigframes/core/compile/aggregate_compiler.py | 22 ++++++++++++++++ bigframes/core/compile/constants.py | 27 ++++++++++++++++++++ bigframes/core/compile/scalar_op_compiler.py | 14 +--------- bigframes/operations/aggregations.py | 16 +++++++++++- 4 files changed, 65 insertions(+), 14 deletions(-) create mode 100644 bigframes/core/compile/constants.py diff --git a/bigframes/core/compile/aggregate_compiler.py b/bigframes/core/compile/aggregate_compiler.py index a17b69815c..84fc29a485 100644 --- a/bigframes/core/compile/aggregate_compiler.py +++ b/bigframes/core/compile/aggregate_compiler.py @@ -26,6 +26,7 @@ import bigframes_vendored.ibis.expr.types as ibis_types import pandas as pd +from bigframes.core.compile import constants import bigframes.core.compile.ibis_types as compile_ibis_types import bigframes.core.compile.scalar_op_compiler as scalar_compilers import bigframes.core.expression as ex @@ -575,6 +576,27 @@ def _( return original_column.delta(shifted_column, part="microsecond") +@compile_unary_agg.register +def _( + op: agg_ops.TimeSeriesDiffOp, + column: ibis_types.Column, + window=None, +) -> ibis_types.Value: + if not column.type().is_date(): + raise TypeError(f"Cannot perform date series diff on type{column.type()}") + + original_column = cast(ibis_types.DateColumn, column) + shifted_column = cast( + ibis_types.DateColumn, + compile_unary_agg(agg_ops.ShiftOp(op.periods), column, window), + ) + + return ( + original_column.delta(shifted_column, part="day") + * constants.UNIT_TO_US_CONVERSION_FACTORS["D"] + ).floor() + + @compile_unary_agg.register def _( op: agg_ops.AllOp, diff --git a/bigframes/core/compile/constants.py b/bigframes/core/compile/constants.py new file mode 100644 index 0000000000..9c307125ab --- /dev/null +++ b/bigframes/core/compile/constants.py @@ -0,0 +1,27 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Datetime constants +UNIT_TO_US_CONVERSION_FACTORS = { + "W": 7 * 24 * 60 * 60 * 1000 * 1000, + "d": 24 * 60 * 60 * 1000 * 1000, + "D": 24 * 60 * 60 * 1000 * 1000, + "h": 60 * 60 * 1000 * 1000, + "m": 60 * 1000 * 1000, + "s": 1000 * 1000, + "ms": 1000, + "us": 1, + "ns": 1e-3, +} diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index 7111406646..9af6a5c0b9 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -26,6 +26,7 @@ import numpy as np import pandas as pd +from bigframes.core.compile.constants import UNIT_TO_US_CONVERSION_FACTORS import bigframes.core.compile.default_ordering import bigframes.core.compile.ibis_types import bigframes.core.expression as ex @@ -50,19 +51,6 @@ ) _OBJ_REF_IBIS_DTYPE = ibis_dtypes.Struct.from_tuples(_OBJ_REF_STRUCT_SCHEMA) # type: ignore -# Datetime constants -UNIT_TO_US_CONVERSION_FACTORS = { - "W": 7 * 24 * 60 * 60 * 1000 * 1000, - "d": 24 * 60 * 60 * 1000 * 1000, - "D": 24 * 60 * 60 * 1000 * 1000, - "h": 60 * 60 * 1000 * 1000, - "m": 60 * 1000 * 1000, - "s": 1000 * 1000, - "ms": 1000, - "us": 1, - "ns": 1e-3, -} - class ScalarOpCompiler: # Mapping of operation name to implemenations diff --git a/bigframes/operations/aggregations.py b/bigframes/operations/aggregations.py index bf6016bb2e..a714f5804c 100644 --- a/bigframes/operations/aggregations.py +++ b/bigframes/operations/aggregations.py @@ -500,7 +500,7 @@ def skips_nulls(self): return False def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: - if dtypes.is_datetime_like(input_types[0]): + if dtypes.is_date_like(input_types[0]): return dtypes.TIMEDELTA_DTYPE return super().output_type(*input_types) @@ -519,6 +519,20 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT raise TypeError(f"expect datetime-like types, but got {input_types[0]}") +@dataclasses.dataclass(frozen=True) +class DateSeriesDiffOp(UnaryWindowOp): + periods: int + + @property + def skips_nulls(self): + return False + + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + if input_types[0] == dtypes.DATE_DTYPE: + return dtypes.TIMEDELTA_DTYPE + raise TypeError(f"expect date type, but got {input_types[0]}") + + @dataclasses.dataclass(frozen=True) class AllOp(UnaryAggregateOp): name: ClassVar[str] = "all" From 09199ea904096741fd3418937d606699ec6c6440 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Mon, 24 Feb 2025 22:05:30 +0000 Subject: [PATCH 2/2] add tests --- bigframes/core/compile/aggregate_compiler.py | 11 +++++--- bigframes/core/rewrite/timedeltas.py | 13 ++++++--- tests/system/small/operations/test_dates.py | 28 ++++++++++++++++++++ 3 files changed, 44 insertions(+), 8 deletions(-) create mode 100644 tests/system/small/operations/test_dates.py diff --git a/bigframes/core/compile/aggregate_compiler.py b/bigframes/core/compile/aggregate_compiler.py index 84fc29a485..edf1e14b3a 100644 --- a/bigframes/core/compile/aggregate_compiler.py +++ b/bigframes/core/compile/aggregate_compiler.py @@ -26,7 +26,7 @@ import bigframes_vendored.ibis.expr.types as ibis_types import pandas as pd -from bigframes.core.compile import constants +from bigframes.core.compile import constants as compiler_constants import bigframes.core.compile.ibis_types as compile_ibis_types import bigframes.core.compile.scalar_op_compiler as scalar_compilers import bigframes.core.expression as ex @@ -578,7 +578,7 @@ def _( @compile_unary_agg.register def _( - op: agg_ops.TimeSeriesDiffOp, + op: agg_ops.DateSeriesDiffOp, column: ibis_types.Column, window=None, ) -> ibis_types.Value: @@ -591,9 +591,12 @@ def _( compile_unary_agg(agg_ops.ShiftOp(op.periods), column, window), ) + conversion_factor = typing.cast( + ibis_types.IntegerValue, compiler_constants.UNIT_TO_US_CONVERSION_FACTORS["D"] + ) + return ( - original_column.delta(shifted_column, part="day") - * constants.UNIT_TO_US_CONVERSION_FACTORS["D"] + original_column.delta(shifted_column, part="day") * conversion_factor ).floor() diff --git a/bigframes/core/rewrite/timedeltas.py b/bigframes/core/rewrite/timedeltas.py index e21e0b6bf2..345a57ab89 100644 --- a/bigframes/core/rewrite/timedeltas.py +++ b/bigframes/core/rewrite/timedeltas.py @@ -215,10 +215,15 @@ def _rewrite_aggregation( else: input_type = aggregation.arg.dtype - if isinstance(aggregation.op, aggs.DiffOp) and dtypes.is_datetime_like(input_type): - return ex.UnaryAggregation( - aggs.TimeSeriesDiffOp(aggregation.op.periods), aggregation.arg - ) + if isinstance(aggregation.op, aggs.DiffOp): + if dtypes.is_datetime_like(input_type): + return ex.UnaryAggregation( + aggs.TimeSeriesDiffOp(aggregation.op.periods), aggregation.arg + ) + elif input_type == dtypes.DATE_DTYPE: + return ex.UnaryAggregation( + aggs.DateSeriesDiffOp(aggregation.op.periods), aggregation.arg + ) if isinstance(aggregation.op, aggs.StdOp) and input_type is dtypes.TIMEDELTA_DTYPE: return ex.UnaryAggregation( diff --git a/tests/system/small/operations/test_dates.py b/tests/system/small/operations/test_dates.py new file mode 100644 index 0000000000..f957879d8b --- /dev/null +++ b/tests/system/small/operations/test_dates.py @@ -0,0 +1,28 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pandas.testing + +from bigframes import dtypes + + +def test_date_series_diff_agg(scalars_dfs): + bf_df, pd_df = scalars_dfs + + actual_result = bf_df["date_col"].diff().to_pandas() + + expected_result = pd_df["date_col"].diff().astype(dtypes.TIMEDELTA_DTYPE) + pandas.testing.assert_series_equal( + actual_result, expected_result, check_index_type=False + )