feat: Add support for temporal types in dataframe's describe() method by sycai · Pull Request #1189 · googleapis/python-bigquery-dataframes · GitHub

Merged

63 changes: 50 additions & 13 deletions bigframes/dataframe.py
@@ -517,6 +517,17 @@ def select_dtypes(self, include=None, exclude=None) -> DataFrame:
)
return DataFrame(self._block.select_columns(selected_columns))

def _select_exact_dtypes(
self, dtypes: Sequence[bigframes.dtypes.Dtype]
) -> DataFrame:
"""Selects columns without considering inheritance relationships."""
columns = [
col_id
for col_id, dtype in zip(self._block.value_columns, self._block.dtypes)
if dtype in dtypes
]
return DataFrame(self._block.select_columns(columns))

def _set_internal_query_job(self, query_job: Optional[bigquery.QueryJob]):
self._query_job = query_job
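
Stepping back from the diff for a moment: a quick illustration of the new _select_exact_dtypes helper above (a sketch for this writeup, not code from the PR; the in-memory frame and expected output are assumptions). select_dtypes() resolves dtype hierarchies, so a parent alias like "number" can match several concrete dtypes, whereas _select_exact_dtypes() keeps a column only when its dtype is literally a member of the given list.

import bigframes.pandas as bpd
import bigframes.dtypes

df = bpd.DataFrame({"i": [1, 2], "f": [1.0, 2.0], "b": [True, False]})

# Exact membership test: only the Int64 column survives; the Float64 and
# boolean columns are dropped because their dtypes are not in the list.
ints_only = df._select_exact_dtypes([bigframes.dtypes.INT_DTYPE])
print(ints_only.columns)  # expected: just "i"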

@@ -2437,13 +2448,9 @@ def agg(
aggregations = [agg_ops.lookup_agg_func(f) for f in func]

for dtype, agg in itertools.product(self.dtypes, aggregations):
if not bigframes.operations.aggregations.is_agg_op_supported(
dtype, agg
):
raise NotImplementedError(
f"Type {dtype} does not support aggregation {agg}. "
f"Share your usecase with the BigQuery DataFrames team at the {constants.FEEDBACK_LINK}"
)
agg.output_type(
dtype
) # Raises exception if the agg does not support the dtype.

return DataFrame(
self._block.summarize(
@@ -2512,7 +2519,10 @@ def melt(

def describe(self, include: None | Literal["all"] = None) -> DataFrame:
if include is None:
numeric_df = self._drop_non_numeric(permissive=False)
numeric_df = self._select_exact_dtypes(
bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE
+ bigframes.dtypes.TEMPORAL_NUMERIC_BIGFRAMES_TYPES
)
if len(numeric_df.columns) == 0:
# Describe eligible non-numeric columns
return self._describe_non_numeric()
@@ -2540,9 +2550,11 @@ def describe(self, include: None | Literal["all"] = None) -> DataFrame:
raise ValueError(f"Unsupported include type: {include}")

def _describe_numeric(self) -> DataFrame:
return typing.cast(
number_df_result = typing.cast(
DataFrame,
self._drop_non_numeric(permissive=False).agg(
self._select_exact_dtypes(
bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE
).agg(
[
"count",
"mean",
@@ -2555,16 +2567,41 @@ def _describe_numeric(self) -> DataFrame:
]
),
)
temporal_df_result = typing.cast(
DataFrame,
self._select_exact_dtypes(
bigframes.dtypes.TEMPORAL_NUMERIC_BIGFRAMES_TYPES
).agg(["count"]),
)

if len(number_df_result.columns) == 0:
return temporal_df_result
elif len(temporal_df_result.columns) == 0:
return number_df_result
else:
import bigframes.core.reshape.api as rs

original_columns = self._select_exact_dtypes(
bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE
+ bigframes.dtypes.TEMPORAL_NUMERIC_BIGFRAMES_TYPES
).columns

# Use reindex after concat to preserve the original column order.
return rs.concat(
[number_df_result, temporal_df_result],
axis=1,
)._reindex_columns(original_columns)

def _describe_non_numeric(self) -> DataFrame:
return typing.cast(
DataFrame,
self.select_dtypes(
include={
self._select_exact_dtypes(
[
bigframes.dtypes.STRING_DTYPE,
bigframes.dtypes.BOOL_DTYPE,
bigframes.dtypes.BYTES_DTYPE,
}
bigframes.dtypes.TIME_DTYPE,
]
).agg(["count", "nunique"]),
)
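
Taken together, the describe() changes mean temporal columns are no longer silently dropped. A minimal sketch of the intended behavior (the example frame and the exact output shape are assumptions for illustration, not code from the PR):

import datetime
import bigframes.pandas as bpd

df = bpd.DataFrame(
    {
        "ts": [datetime.datetime(2024, 1, 1), datetime.datetime(2024, 1, 2)],
        "x": [1.0, 2.0],
    }
)

stats = df.describe()
# Numeric column "x" should get the full summary (count, mean, std,
# quartiles, ...); temporal column "ts" should contribute only a "count"
# row, with the remaining rows left null after the concat + reindex above.
print(stats.to_pandas())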

17 changes: 13 additions & 4 deletions bigframes/dtypes.py
@@ -18,7 +18,7 @@
import datetime
import decimal
import typing
from typing import Dict, Literal, Union
from typing import Dict, List, Literal, Union

import bigframes_vendored.constants as constants
import geopandas as gpd # type: ignore
@@ -211,7 +211,7 @@ class SimpleDtypeInfo:

# Corresponds to the pandas concept of numeric type (such as when 'numeric_only' is specified in an operation)
# Pandas is inconsistent, so two definitions are provided, each used in different contexts
NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE = [
NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE: List[Dtype] = [
FLOAT_DTYPE,
INT_DTYPE,
]
@@ -222,7 +222,16 @@ class SimpleDtypeInfo:
]


## dtype predicates - use these to maintain consistency
# Temporal types that are considered "numeric" by pandas
TEMPORAL_NUMERIC_BIGFRAMES_TYPES: List[Dtype] = [
DATE_DTYPE,
TIMESTAMP_DTYPE,
DATETIME_DTYPE,
]
TEMPORAL_BIGFRAMES_TYPES = TEMPORAL_NUMERIC_BIGFRAMES_TYPES + [TIME_DTYPE]


# dtype predicates - use these to maintain consistency
def is_datetime_like(type_: ExpressionType) -> bool:
return type_ in (DATETIME_DTYPE, TIMESTAMP_DTYPE)

@@ -630,7 +639,7 @@ def can_coerce(source_type: ExpressionType, target_type: ExpressionType) -> bool
return True # None can be coerced to any supported type
else:
return (source_type == STRING_DTYPE) and (
target_type in (DATETIME_DTYPE, TIMESTAMP_DTYPE, TIME_DTYPE, DATE_DTYPE)
target_type in TEMPORAL_BIGFRAMES_TYPES
)
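
The refactored predicate is behavior-preserving; a small sketch of what it asserts (assuming the branches folded out of this hunk add no other coercions):

import bigframes.dtypes as dtypes

# STRING can be coerced to every temporal type, now named in one place.
for target in dtypes.TEMPORAL_BIGFRAMES_TYPES:
    assert dtypes.can_coerce(dtypes.STRING_DTYPE, target)

# A non-string source should not coerce to a temporal type.
assert not dtypes.can_coerce(dtypes.INT_DTYPE, dtypes.TIMESTAMP_DTYPE)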


11 changes: 0 additions & 11 deletions bigframes/operations/aggregations.py
@@ -579,14 +579,3 @@ def lookup_agg_func(key: str) -> typing.Union[UnaryAggregateOp, NullaryAggregateOp]:
return _AGGREGATIONS_LOOKUP[key]
else:
raise ValueError(f"Unrecognize aggregate function: {key}")


def is_agg_op_supported(dtype: dtypes.Dtype, op: AggregateOp) -> bool:
if dtype in dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE:
return True

if dtype in (dtypes.STRING_DTYPE, dtypes.BOOL_DTYPE, dtypes.BYTES_DTYPE):
return isinstance(op, (CountOp, NuniqueOp))

# For all other types, support no aggregation
return False
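
With is_agg_op_supported gone, validation now happens in agg() itself: each op's output_type(dtype) raises for dtypes it cannot handle (see the dataframe.py hunk above). A rough sketch of that pattern (the exact exception type is an assumption):

from bigframes.operations import aggregations as agg_ops
import bigframes.dtypes as dtypes

op = agg_ops.lookup_agg_func("mean")
op.output_type(dtypes.FLOAT_DTYPE)  # fine: mean is defined for floats

try:
    op.output_type(dtypes.STRING_DTYPE)  # assumed to raise for strings
except Exception as exc:
    print(f"mean rejected STRING: {exc}")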
45 changes: 37 additions & 8 deletions tests/system/small/test_dataframe.py
@@ -2671,11 +2671,11 @@ def test_dataframe_agg_int_multi_string(scalars_dfs):


@skip_legacy_pandas
def test_df_describe(scalars_dfs):
def test_df_describe_non_temporal(scalars_dfs):
scalars_df, scalars_pandas_df = scalars_dfs
# pyarrows time columns fail in pandas
# Excluding temporal columns here because BigFrames cannot perform percentile operations on them
unsupported_columns = ["datetime_col", "timestamp_col", "time_col", "date_col"]
bf_result = scalars_df.describe().to_pandas()
bf_result = scalars_df.drop(columns=unsupported_columns).describe().to_pandas()

modified_pd_df = scalars_pandas_df.drop(columns=unsupported_columns)
pd_result = modified_pd_df.describe()
@@ -2709,12 +2709,14 @@ def test_df_describe_non_numeric(scalars_dfs, include):
def test_df_describe_non_numeric(scalars_dfs, include):
scalars_df, scalars_pandas_df = scalars_dfs

non_numeric_columns = ["string_col", "bytes_col", "bool_col"]
# Excluding "date_col" here because in BigFrames it is used as PyArrow[date32()], which is
# considered numerical in Pandas
target_columns = ["string_col", "bytes_col", "bool_col", "time_col"]

modified_bf = scalars_df[non_numeric_columns]
modified_bf = scalars_df[target_columns]
bf_result = modified_bf.describe(include=include).to_pandas()

modified_pd_df = scalars_pandas_df[non_numeric_columns]
modified_pd_df = scalars_pandas_df[target_columns]
pd_result = modified_pd_df.describe(include=include)

# Reindex results with the specified keys and their order, because
@@ -2726,8 +2728,35 @@ def test_df_describe_non_numeric(scalars_dfs, include):
).rename(index={"unique": "nunique"})

pd.testing.assert_frame_equal(
pd_result[non_numeric_columns].astype("Int64"),
bf_result[non_numeric_columns],
pd_result.astype("Int64"),
bf_result,
check_index_type=False,
)


@skip_legacy_pandas
def test_df_describe_temporal(scalars_dfs):
scalars_df, scalars_pandas_df = scalars_dfs

temporal_columns = ["datetime_col", "timestamp_col", "time_col", "date_col"]

modified_bf = scalars_df[temporal_columns]
bf_result = modified_bf.describe(include="all").to_pandas()

modified_pd_df = scalars_pandas_df[temporal_columns]
pd_result = modified_pd_df.describe(include="all")

# Reindex results with the specified keys and their order, because
# the relative order is not important.
bf_result = bf_result.reindex(["count", "nunique"])
pd_result = pd_result.reindex(
["count", "unique"]
# The BigFrames counterpart of "unique" is called "nunique"
).rename(index={"unique": "nunique"})

pd.testing.assert_frame_equal(
pd_result.astype("Float64"),
bf_result.astype("Float64"),
check_index_type=False,
)

83 changes: 0 additions & 83 deletions

This file was deleted.
