diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 29f22c28b9..c4597ab843 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -18,6 +18,7 @@ import datetime import inspect +import itertools import re import sys import textwrap @@ -70,6 +71,7 @@ import bigframes.exceptions import bigframes.formatting_helpers as formatter import bigframes.operations as ops +import bigframes.operations.aggregations import bigframes.operations.aggregations as agg_ops import bigframes.operations.plotting as plotting import bigframes.operations.structs @@ -2207,14 +2209,17 @@ def agg( self, func: str | typing.Sequence[str] ) -> DataFrame | bigframes.series.Series: if utils.is_list_like(func): - if any( - dtype not in bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE - for dtype in self.dtypes - ): - raise NotImplementedError( - f"Multiple aggregations only supported on numeric columns. {constants.FEEDBACK_LINK}" - ) aggregations = [agg_ops.lookup_agg_func(f) for f in func] + + for dtype, agg in itertools.product(self.dtypes, aggregations): + if not bigframes.operations.aggregations.is_agg_op_supported( + dtype, agg + ): + raise NotImplementedError( + f"Type {dtype} does not support aggregation {agg}. " + f"Share your usecase with the BigQuery DataFrames team at the {constants.FEEDBACK_LINK}" + ) + return DataFrame( self._block.summarize( self._block.value_columns, @@ -2280,16 +2285,55 @@ def melt( self._block.melt(id_col_ids, val_col_ids, var_name, value_name) ) - def describe(self) -> DataFrame: - df_numeric = self._drop_non_numeric(permissive=False) - if len(df_numeric.columns) == 0: - raise NotImplementedError( - f"df.describe() currently only supports numeric values. 
{constants.FEEDBACK_LINK}" + _NUMERICAL_DESCRIBE_AGGS = ( + "count", + "mean", + "std", + "min", + "25%", + "50%", + "75%", + "max", + ) + _NON_NUMERICAL_DESCRIBE_AGGS = ("count", "nunique") + + def describe(self, include: None | Literal["all"] = None) -> DataFrame: + if include is None: + numeric_df = self._drop_non_numeric(permissive=False) + if len(numeric_df.columns) == 0: + # Describe eligible non-numerical columns + result = self._drop_non_string().agg(self._NON_NUMERICAL_DESCRIBE_AGGS) + else: + # Otherwise, only describe numerical columns + result = numeric_df.agg(self._NUMERICAL_DESCRIBE_AGGS) + return typing.cast(DataFrame, result) + + elif include == "all": + numeric_result = typing.cast( + DataFrame, + self._drop_non_numeric(permissive=False).agg( + self._NUMERICAL_DESCRIBE_AGGS + ), + ) + string_result = typing.cast( + DataFrame, + self._drop_non_string().agg(self._NON_NUMERICAL_DESCRIBE_AGGS), ) - result = df_numeric.agg( - ["count", "mean", "std", "min", "25%", "50%", "75%", "max"] - ) - return typing.cast(DataFrame, result) + + if len(numeric_result.columns) == 0: + return string_result + elif len(string_result.columns) == 0: + return numeric_result + else: + import bigframes.core.reshape as rs + + # Use reindex after join to preserve the original column order. 
+ return rs.concat( + [numeric_result, string_result], axis=1 + )._reindex_columns(self.columns) + + else: + raise ValueError(f"Unsupported include type: {include}") def skew(self, *, numeric_only: bool = False): if not numeric_only: @@ -2487,7 +2531,7 @@ def unstack(self, level: LevelsType = -1): return DataFrame(pivot_block) def _drop_non_numeric(self, permissive=True) -> DataFrame: - types_to_keep = ( + numerical_types = ( set(bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE) if permissive else set(bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE) @@ -2495,10 +2539,18 @@ def _drop_non_numeric(self, permissive=True) -> DataFrame: non_numeric_cols = [ col_id for col_id, dtype in zip(self._block.value_columns, self._block.dtypes) - if dtype not in types_to_keep + if dtype not in numerical_types ] return DataFrame(self._block.drop_columns(non_numeric_cols)) + def _drop_non_string(self) -> DataFrame: + string_cols = [ + col_id + for col_id, dtype in zip(self._block.value_columns, self._block.dtypes) + if dtype == bigframes.dtypes.STRING_DTYPE + ] + return DataFrame(self._block.select_columns(string_cols)) + def _drop_non_bool(self) -> DataFrame: non_bool_cols = [ col_id diff --git a/bigframes/operations/aggregations.py b/bigframes/operations/aggregations.py index 675ead1188..f20429e449 100644 --- a/bigframes/operations/aggregations.py +++ b/bigframes/operations/aggregations.py @@ -562,3 +562,14 @@ def lookup_agg_func(key: str) -> typing.Union[UnaryAggregateOp, NullaryAggregate return _AGGREGATIONS_LOOKUP[key] else: raise ValueError(f"Unrecognize aggregate function: {key}") + + +def is_agg_op_supported(dtype: dtypes.Dtype, op: AggregateOp) -> bool: + if dtype in dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE: + return True + + if dtype == dtypes.STRING_DTYPE: + return isinstance(op, (CountOp, NuniqueOp)) + + # For all other types, support no aggregation + return False diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 
b9291085fa..fe63a1ed28 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -2612,6 +2612,87 @@ def test_df_describe(scalars_dfs): ).all() +@skip_legacy_pandas +@pytest.mark.parametrize("include", [None, "all"]) +def test_df_describe_non_numerical(scalars_dfs, include): + scalars_df, scalars_pandas_df = scalars_dfs + + non_numerical_columns = ["string_col"] + + modified_bf = scalars_df[non_numerical_columns] + bf_result = modified_bf.describe(include=include).to_pandas() + + modified_pd_df = scalars_pandas_df[non_numerical_columns] + pd_result = modified_pd_df.describe(include=include) + + # Reindex results with the specified keys and their order, because + # the relative order is not important. + bf_result = bf_result.reindex(["count", "nunique"]) + pd_result = pd_result.reindex( + ["count", "unique"] + # BF counter part of "unique" is called "nunique" + ).rename(index={"unique": "nunique"}) + + pd.testing.assert_frame_equal( + pd_result[non_numerical_columns].astype("Int64"), + bf_result[non_numerical_columns], + check_index_type=False, + ) + + +@skip_legacy_pandas +def test_df_describe_mixed_types_include_all(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + numerical_columns = [ + "int64_col", + "float64_col", + ] + non_numerical_columns = ["string_col"] + supported_columns = numerical_columns + non_numerical_columns + + modified_bf = scalars_df[supported_columns] + bf_result = modified_bf.describe(include="all").to_pandas() + + modified_pd_df = scalars_pandas_df[supported_columns] + pd_result = modified_pd_df.describe(include="all") + + # Drop quartiles, as they are approximate + bf_min = bf_result.loc["min", :] + bf_p25 = bf_result.loc["25%", :] + bf_p50 = bf_result.loc["50%", :] + bf_p75 = bf_result.loc["75%", :] + bf_max = bf_result.loc["max", :] + + # Reindex results with the specified keys and their order, because + # the relative order is not important. 
+ bf_result = bf_result.reindex(["count", "nunique", "mean", "std", "min", "max"]) + pd_result = pd_result.reindex( + ["count", "unique", "mean", "std", "min", "max"] + # BF counterpart of "unique" is called "nunique" + ).rename(index={"unique": "nunique"}) + + pd.testing.assert_frame_equal( + pd_result[numerical_columns].astype("Float64"), + bf_result[numerical_columns], + check_index_type=False, + ) + + pd.testing.assert_frame_equal( + pd_result[non_numerical_columns].astype("Int64"), + bf_result[non_numerical_columns], + check_index_type=False, + ) + + # Double-check that quantiles are at least plausible. + assert ( + (bf_min <= bf_p25) + & (bf_p25 <= bf_p50) + & (bf_p50 <= bf_p75) + & (bf_p75 <= bf_max) + ).all() + + def test_df_transpose(): # Include some floats to ensure type coercion values = [[0, 3.5, True], [1, 4.5, False], [2, 6.5, None]] diff --git a/tests/unit/operations/__init__.py b/tests/unit/operations/__init__.py new file mode 100644 index 0000000000..6d5e14bcf4 --- /dev/null +++ b/tests/unit/operations/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/tests/unit/operations/test_aggregations.py b/tests/unit/operations/test_aggregations.py new file mode 100644 index 0000000000..4cb6934c9d --- /dev/null +++ b/tests/unit/operations/test_aggregations.py @@ -0,0 +1,92 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +import bigframes.dtypes as dtypes +from bigframes.operations.aggregations import ( + all_op, + any_op, + count_op, + dense_rank_op, + first_op, + is_agg_op_supported, + max_op, + mean_op, + median_op, + min_op, + nunique_op, + product_op, + rank_op, + size_op, + std_op, + sum_op, + var_op, +) + +_ALL_OPS = set( + [ + size_op, + sum_op, + mean_op, + median_op, + product_op, + max_op, + min_op, + std_op, + var_op, + count_op, + nunique_op, + rank_op, + dense_rank_op, + all_op, + any_op, + first_op, + ] +) +_STRING_SUPPORTED_OPS = set([count_op, nunique_op]) + + +@pytest.mark.parametrize("dtype", dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE) +@pytest.mark.parametrize("op", _ALL_OPS) +def test_is_agg_op_supported_numerical_support_all(dtype, op): + assert is_agg_op_supported(dtype, op) is True + + +@pytest.mark.parametrize("dtype", [dtypes.STRING_DTYPE]) +@pytest.mark.parametrize("op", _STRING_SUPPORTED_OPS) +def test_is_agg_op_supported_string_support_ops(dtype, op): + assert is_agg_op_supported(dtype, op) is True + + +@pytest.mark.parametrize("dtype", [dtypes.STRING_DTYPE]) +@pytest.mark.parametrize("op", _ALL_OPS - _STRING_SUPPORTED_OPS) +def 
test_is_agg_op_supported_string_not_support_ops(dtype, op): + assert is_agg_op_supported(dtype, op) is False + + +@pytest.mark.parametrize( + "dtype", + [ + dtypes.BYTES_DTYPE, + dtypes.DATE_DTYPE, + dtypes.TIME_DTYPE, + dtypes.DATETIME_DTYPE, + dtypes.TIMESTAMP_DTYPE, + dtypes.GEO_DTYPE, + ], +) +@pytest.mark.parametrize("op", _ALL_OPS) +def test_is_agg_op_supported_non_numerical_no_support(dtype, op): + assert is_agg_op_supported(dtype, op) is False