From 91adae8ec07f48f59da31303a8d39d3e8b074909 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Thu, 27 Mar 2025 00:11:03 +0000 Subject: [PATCH] feat: allow pandas.cut 'labels' parameter to accept a list of string --- bigframes/core/compile/aggregate_compiler.py | 16 ++++- bigframes/core/reshape/tile.py | 43 +++++++++--- bigframes/operations/aggregations.py | 4 +- tests/system/small/test_pandas.py | 65 +++++++++++++++---- tests/unit/test_pandas.py | 57 ++++++++++++++-- .../pandas/core/reshape/tile.py | 21 ++++-- 6 files changed, 169 insertions(+), 37 deletions(-) diff --git a/bigframes/core/compile/aggregate_compiler.py b/bigframes/core/compile/aggregate_compiler.py index f5ce301c14..0d31798f25 100644 --- a/bigframes/core/compile/aggregate_compiler.py +++ b/bigframes/core/compile/aggregate_compiler.py @@ -366,7 +366,13 @@ def _( for this_bin in range(op.bins): if op.labels is False: value = compile_ibis_types.literal_to_ibis_scalar( - this_bin, force_dtype=pd.Int64Dtype() + this_bin, + force_dtype=pd.Int64Dtype(), + ) + elif isinstance(op.labels, typing.Iterable): + value = compile_ibis_types.literal_to_ibis_scalar( + list(op.labels)[this_bin], + force_dtype=pd.StringDtype(storage="pyarrow"), ) else: left_adj = adj if this_bin == 0 and op.right else 0 @@ -402,7 +408,13 @@ def _( if op.labels is False: value = compile_ibis_types.literal_to_ibis_scalar( - this_bin, force_dtype=pd.Int64Dtype() + this_bin, + force_dtype=pd.Int64Dtype(), + ) + elif isinstance(op.labels, typing.Iterable): + value = compile_ibis_types.literal_to_ibis_scalar( + list(op.labels)[this_bin], + force_dtype=pd.StringDtype(storage="pyarrow"), ) else: if op.right: diff --git a/bigframes/core/reshape/tile.py b/bigframes/core/reshape/tile.py index 7ab4642347..86ccf52408 100644 --- a/bigframes/core/reshape/tile.py +++ b/bigframes/core/reshape/tile.py @@ -20,6 +20,7 @@ import bigframes_vendored.pandas.core.reshape.tile as vendored_pandas_tile import pandas as pd +import bigframes.constants import bigframes.core.expression as ex import bigframes.core.ordering as order import bigframes.core.utils as utils @@ -41,17 +42,37 @@ def cut( right: typing.Optional[bool] = True, labels: typing.Union[typing.Iterable[str], bool, None] = None, ) -> bigframes.series.Series: - if labels is not None and labels is not False: + if ( + labels is not None + and labels is not False + and not isinstance(labels, typing.Iterable) + ): + raise ValueError( + "Bin labels must either be False, None or passed in as a list-like argument" + ) + if ( + isinstance(labels, typing.Iterable) + and len(list(labels)) > 0 + and not isinstance(list(labels)[0], str) + ): raise NotImplementedError( - "The 'labels' parameter must be either False or None. " - "Please provide a valid value for 'labels'." + "When using an iterable for labels, only iterables of strings are supported " + f"but found {type(list(labels)[0])}. {constants.FEEDBACK_LINK}" ) + if x.size == 0: raise ValueError("Cannot cut empty array.") if isinstance(bins, int): if bins <= 0: raise ValueError("`bins` should be a positive integer.") + if isinstance(labels, typing.Iterable): + labels = tuple(labels) + if len(labels) != bins: + raise ValueError( + f"Bin labels({len(labels)}) must be same as the value of bins({bins})" + ) + op = agg_ops.CutOp(bins, right=right, labels=labels) return x._apply_window_op(op, window_spec=window_specs.unbound()) elif isinstance(bins, typing.Iterable): @@ -64,9 +85,6 @@ def cut( elif len(list(bins)) == 0: as_index = pd.IntervalIndex.from_tuples(list(bins)) bins = tuple() - # To maintain consistency with pandas' behavior - right = True - labels = None elif isinstance(list(bins)[0], tuple): as_index = pd.IntervalIndex.from_tuples(list(bins)) bins = tuple(bins) @@ -88,8 +106,17 @@ def cut( raise ValueError("`bins` iterable should contain tuples or numerics.") if as_index.is_overlapping: - raise ValueError("Overlapping IntervalIndex is not accepted.") - elif len(as_index) == 0: + raise ValueError("Overlapping IntervalIndex is not accepted.") # TODO: test + + if isinstance(labels, typing.Iterable): + labels = tuple(labels) + if len(labels) != len(as_index): + raise ValueError( + f"Bin labels({len(labels)}) must be same as the number of bin edges" + f"({len(as_index)})" + ) + + if len(as_index) == 0: dtype = agg_ops.CutOp(bins, right=right, labels=labels).output_type() return bigframes.series.Series( [pd.NA] * len(x), diff --git a/bigframes/operations/aggregations.py b/bigframes/operations/aggregations.py index e509b2e21b..e3f15e67a1 100644 --- a/bigframes/operations/aggregations.py +++ b/bigframes/operations/aggregations.py @@ -340,7 +340,7 @@ class CutOp(UnaryWindowOp): # TODO: Unintuitive, refactor into multiple ops? bins: typing.Union[int, Iterable] right: Optional[bool] - labels: Optional[bool] + labels: typing.Union[bool, Iterable[str], None] @property def skips_nulls(self): @@ -349,6 +349,8 @@ def skips_nulls(self): def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: if self.labels is False: return dtypes.INT_DTYPE + elif isinstance(self.labels, Iterable): + return dtypes.STRING_DTYPE else: # Assumption: buckets use same numeric type if isinstance(self.bins, int): diff --git a/tests/system/small/test_pandas.py b/tests/system/small/test_pandas.py index b2eb9c2674..242e77ee9d 100644 --- a/tests/system/small/test_pandas.py +++ b/tests/system/small/test_pandas.py @@ -404,6 +404,14 @@ def _convert_pandas_category(pd_s: pd.Series): f"Input must be a pandas Series with categorical data: {pd_s.dtype}" ) + if pd.api.types.is_object_dtype(pd_s.cat.categories.dtype): + return pd_s.astype(pd.StringDtype(storage="pyarrow")) + + if not isinstance(pd_s.cat.categories.dtype, pd.IntervalDtype): + raise ValueError( + f"Must be a IntervalDtype with categorical data: {pd_s.cat.categories.dtype}" + ) + if pd_s.cat.categories.dtype.closed == "left": # type: ignore left_key = "left_inclusive" right_key = "right_exclusive" @@ -465,6 +473,17 @@ def test_cut_by_int_bins(scalars_dfs, labels, right): pd.testing.assert_series_equal(bf_result.to_pandas(), pd_result) +def test_cut_by_int_bins_w_labels(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + labels = ["A", "B", "C", "D", "E"] + pd_result = pd.cut(scalars_pandas_df["float64_col"], 5, labels=labels) + bf_result = bpd.cut(scalars_df["float64_col"], 5, labels=labels) + + pd_result = _convert_pandas_category(pd_result) + pd.testing.assert_series_equal(bf_result.to_pandas(), pd_result) + + @pytest.mark.parametrize( ("breaks", "right", "labels"), [ @@ -494,7 +513,7 @@ def test_cut_by_int_bins(scalars_dfs, labels, right): ), ], ) -def test_cut_numeric_breaks(scalars_dfs, breaks, right, labels): +def test_cut_by_numeric_breaks(scalars_dfs, breaks, right, labels): scalars_df, scalars_pandas_df = scalars_dfs pd_result = pd.cut( @@ -508,6 +527,18 @@ def test_cut_numeric_breaks(scalars_dfs, breaks, right, labels): pd.testing.assert_series_equal(bf_result, pd_result_converted) +def test_cut_by_numeric_breaks_w_labels(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + bins = [0, 5, 10, 15, 20] + labels = ["A", "B", "C", "D"] + pd_result = pd.cut(scalars_pandas_df["float64_col"], bins, labels=labels) + bf_result = bpd.cut(scalars_df["float64_col"], bins, labels=labels) + + pd_result = _convert_pandas_category(pd_result) + pd.testing.assert_series_equal(bf_result.to_pandas(), pd_result) + + @pytest.mark.parametrize( ("bins", "right", "labels"), [ @@ -534,7 +565,7 @@ def test_cut_numeric_breaks(scalars_dfs, breaks, right, labels): ), ], ) -def test_cut_with_interval(scalars_dfs, bins, right, labels): +def test_cut_by_interval_bins(scalars_dfs, bins, right, labels): scalars_df, scalars_pandas_df = scalars_dfs bf_result = bpd.cut( scalars_df["int64_too"], bins, labels=labels, right=right @@ -548,22 +579,30 @@ def test_cut_with_interval(scalars_dfs, bins, right, labels): pd.testing.assert_series_equal(bf_result, pd_result_converted) +def test_cut_by_interval_bins_w_labels(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + bins = pd.IntervalIndex.from_tuples([(1, 2), (2, 3), (4, 5)]) + labels = ["A", "B", "C", "D", "E"] + pd_result = pd.cut(scalars_pandas_df["float64_col"], bins, labels=labels) + bf_result = bpd.cut(scalars_df["float64_col"], bins, labels=labels) + + pd_result = _convert_pandas_category(pd_result) + pd.testing.assert_series_equal(bf_result.to_pandas(), pd_result) + + @pytest.mark.parametrize( - "bins", + ("bins", "labels"), [ - pytest.param([], id="empty_breaks"), - pytest.param( - [1], id="single_int_breaks", marks=pytest.mark.skip(reason="b/404338651") - ), - pytest.param(pd.IntervalIndex.from_tuples([]), id="empty_interval_index"), + pytest.param([], None, id="empty_breaks"), + pytest.param([1], False, id="single_int_breaks"), + pytest.param(pd.IntervalIndex.from_tuples([]), None, id="empty_interval_index"), ], ) -def test_cut_by_edge_cases_bins(scalars_dfs, bins): +def test_cut_by_edge_cases_bins(scalars_dfs, bins, labels): scalars_df, scalars_pandas_df = scalars_dfs - bf_result = bpd.cut(scalars_df["int64_too"], bins, labels=False).to_pandas() - if isinstance(bins, list): - bins = pd.IntervalIndex.from_tuples(bins) - pd_result = pd.cut(scalars_pandas_df["int64_too"], bins, labels=False) + bf_result = bpd.cut(scalars_df["int64_too"], bins, labels=labels).to_pandas() + pd_result = pd.cut(scalars_pandas_df["int64_too"], bins, labels=labels) pd_result_converted = _convert_pandas_category(pd_result) pd.testing.assert_series_equal(bf_result, pd_result_converted) diff --git a/tests/unit/test_pandas.py b/tests/unit/test_pandas.py index 64a287aaca..e8383512a6 100644 --- a/tests/unit/test_pandas.py +++ b/tests/unit/test_pandas.py @@ -91,13 +91,48 @@ def test_method_matches_session(method_name: str): assert pandas_signature.return_annotation == session_signature.return_annotation -def test_cut_raises_with_labels(): +@pytest.mark.parametrize( + ("bins", "labels", "error_message"), + [ + pytest.param( + 5, + True, + "Bin labels must either be False, None or passed in as a list-like argument", + id="true", + ), + pytest.param( + 5, + 1.5, + "Bin labels must either be False, None or passed in as a list-like argument", + id="invalid_types", + ), + pytest.param( + 2, + ["A"], + "must be same as the value of bins", + id="int_bins_mismatch", + ), + pytest.param( + [1, 2, 3], + ["A"], + "must be same as the number of bin edges", + id="iterator_bins_mismatch", + ), + ], +) +def test_cut_raises_with_invalid_labels(bins: int, labels, error_message: str): + mock_series = mock.create_autospec(bigframes.pandas.Series, instance=True) + with pytest.raises(ValueError, match=error_message): + bigframes.pandas.cut(mock_series, bins, labels=labels) + + +def test_cut_raises_with_unsupported_labels(): + mock_series = mock.create_autospec(bigframes.pandas.Series, instance=True) + labels = [1, 2] with pytest.raises( - NotImplementedError, - match="The 'labels' parameter must be either False or None.", + NotImplementedError, match=r".*only iterables of strings are supported.*" ): - mock_series = mock.create_autospec(bigframes.pandas.Series, instance=True) - bigframes.pandas.cut(mock_series, 4, labels=["a", "b", "c", "d"]) + bigframes.pandas.cut(mock_series, 2, labels=labels) # type: ignore @pytest.mark.parametrize( @@ -111,11 +146,21 @@ def test_cut_raises_with_labels(): "`bins` iterable should contain tuples or numerics", id="iterable_w_wrong_type", ), + pytest.param( + [10, 3], + "left side of interval must be <= right side", + id="decreased_breaks", + ), + pytest.param( + [(1, 10), (2, 25)], + "Overlapping IntervalIndex is not accepted.", + id="overlapping_intervals", + ), ], ) def test_cut_raises_with_invalid_bins(bins: int, error_message: str): + mock_series = mock.create_autospec(bigframes.pandas.Series, instance=True) with pytest.raises(ValueError, match=error_message): - mock_series = mock.create_autospec(bigframes.pandas.Series, instance=True) bigframes.pandas.cut(mock_series, bins, labels=False) diff --git a/third_party/bigframes_vendored/pandas/core/reshape/tile.py b/third_party/bigframes_vendored/pandas/core/reshape/tile.py index d911a303eb..fccaffdadf 100644 --- a/third_party/bigframes_vendored/pandas/core/reshape/tile.py +++ b/third_party/bigframes_vendored/pandas/core/reshape/tile.py @@ -31,8 +31,6 @@ def cut( age ranges. Supports binning into an equal number of bins, or a pre-specified array of bins. - ``labels=False`` implies you just want the bins back. - **Examples:** >>> import bigframes.pandas as bpd @@ -55,7 +53,16 @@ def cut( 3 {'left_exclusive': 7.5, 'right_inclusive': 10.0} dtype: struct[pyarrow] - Cut with an integer (equal-width bins) and labels=False: + Cut with the same bins, but assign them specific labels: + + >>> bpd.cut(s, bins=3, labels=["bad", "medium", "good"]) + 0 bad + 1 bad + 2 medium + 3 good + dtype: string + + `labels=False` implies you want the bins back. >>> bpd.cut(s, bins=4, labels=False) 0 0 @@ -67,7 +74,6 @@ def cut( Cut with pd.IntervalIndex, requires importing pandas for IntervalIndex: >>> import pandas as pd - >>> interval_index = pd.IntervalIndex.from_tuples([(0, 1), (1, 5), (5, 20)]) >>> bpd.cut(s, bins=interval_index) 0 @@ -107,7 +113,7 @@ def cut( dtype: struct[pyarrow] Args: - x (Series): + x (bigframes.pandas.Series): The input Series to be binned. Must be 1-dimensional. bins (int, pd.IntervalIndex, Iterable): The criteria to bin by. @@ -127,10 +133,11 @@ def cut( ``right == True`` (the default), then the `bins` ``[1, 2, 3, 4]`` indicate (1,2], (2,3], (3,4]. This argument is ignored when `bins` is an IntervalIndex. - labels (default None): + labels (bool, Iterable, default None): Specifies the labels for the returned bins. Must be the same length as the resulting bins. If False, returns only integer indicators of the - bins. This affects the type of the output container. + bins. This affects the type of the output container. This argument is + ignored when `bins` is an IntervalIndex. If True, raises an error. Returns: bigframes.pandas.Series: