-
-
Notifications
You must be signed in to change notification settings - Fork 18.7k
BUG/PERF: groupby.transform with unobserved categories #58084
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
a52e7fe
898fd12
5311004
fb548ad
baa1b28
30013ee
3b9d27b
4221c34
8588a1e
8e669d9
0d9f89d
84f83ae
cbabce0
bcca14f
f3a3f63
58e759f
7f99b71
4364440
5d63405
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
- Loading branch information
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1096,6 +1096,7 @@ class GroupBy(BaseGroupBy[NDFrameT]): | |
def __init__( | ||
self, | ||
obj: NDFrameT, | ||
orig_obj: NDFrameT | None = None, | ||
keys: _KeysArgType | None = None, | ||
level: IndexLabel | None = None, | ||
grouper: ops.BaseGrouper | None = None, | ||
|
@@ -1117,6 +1118,7 @@ def __init__( | |
self.sort = sort | ||
self.group_keys = group_keys | ||
self.dropna = dropna | ||
self.orig_obj = obj if orig_obj is None else orig_obj | ||
|
||
if grouper is None: | ||
grouper, exclusions, obj = get_grouper( | ||
|
@@ -1879,24 +1881,70 @@ def _transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs): | |
|
||
else: | ||
# i.e. func in base.reduction_kernels | ||
if self.observed: | ||
return self._reduction_kernel_transform( | ||
func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs | ||
) | ||
|
||
# GH#30918 Use _transform_fast only when we know func is an aggregation | ||
# If func is a reduction, we need to broadcast the | ||
# result to the whole group. Compute func result | ||
# and deal with possible broadcasting below. | ||
with com.temp_setattr(self, "as_index", True): | ||
# GH#49834 - result needs groups in the index for | ||
# _wrap_transform_fast_result | ||
if func in ["idxmin", "idxmax"]: | ||
func = cast(Literal["idxmin", "idxmax"], func) | ||
result = self._idxmax_idxmin(func, True, *args, **kwargs) | ||
else: | ||
if engine is not None: | ||
kwargs["engine"] = engine | ||
kwargs["engine_kwargs"] = engine_kwargs | ||
result = getattr(self, func)(*args, **kwargs) | ||
grouper, exclusions, obj = get_grouper( | ||
self.orig_obj, | ||
self.keys, | ||
level=self.level, | ||
sort=self.sort, | ||
observed=True, | ||
dropna=self.dropna, | ||
) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we'll want to cache this on the groupby instance - we do not want to have to recompute it if the groupby is reused. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. resolved, the group by init now accepts |
||
exclusions = frozenset(exclusions) if exclusions else frozenset() | ||
obj_has_not_changed = self.orig_obj.equals(self.obj) | ||
|
||
with ( | ||
com.temp_setattr(self, "observed", True), | ||
com.temp_setattr(self, "_grouper", grouper), | ||
com.temp_setattr(self, "exclusions", exclusions), | ||
com.temp_setattr(self, "obj", obj, condition=obj_has_not_changed), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why can't we unconditionally set There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. resolved, removed setting obj to obj |
||
): | ||
return self._reduction_kernel_transform( | ||
func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs | ||
) | ||
|
||
# with com.temp_setattr(self, "as_index", True): | ||
# # GH#49834 - result needs groups in the index for | ||
# # _wrap_transform_fast_result | ||
# if func in ["idxmin", "idxmax"]: | ||
# func = cast(Literal["idxmin", "idxmax"], func) | ||
# result = self._idxmax_idxmin(func, True, *args, **kwargs) | ||
# else: | ||
# if engine is not None: | ||
# kwargs["engine"] = engine | ||
# kwargs["engine_kwargs"] = engine_kwargs | ||
# result = getattr(self, func)(*args, **kwargs) | ||
|
||
# print("result with observed = False\n", result.to_string()) | ||
# r = self._wrap_transform_fast_result(result) | ||
# print("reindexed result", r.to_string()) | ||
# return r | ||
|
||
@final | ||
def _reduction_kernel_transform( | ||
self, func, *args, engine=None, engine_kwargs=None, **kwargs | ||
): | ||
# GH#30918 Use _transform_fast only when we know func is an aggregation | ||
# If func is a reduction, we need to broadcast the | ||
# result to the whole group. Compute func result | ||
# and deal with possible broadcasting below. | ||
with com.temp_setattr(self, "as_index", True): | ||
# GH#49834 - result needs groups in the index for | ||
# _wrap_transform_fast_result | ||
if func in ["idxmin", "idxmax"]: | ||
func = cast(Literal["idxmin", "idxmax"], func) | ||
result = self._idxmax_idxmin(func, True, *args, **kwargs) | ||
else: | ||
if engine is not None: | ||
kwargs["engine"] = engine | ||
kwargs["engine_kwargs"] = engine_kwargs | ||
result = getattr(self, func)(*args, **kwargs) | ||
|
||
return self._wrap_transform_fast_result(result) | ||
return self._wrap_transform_fast_result(result) | ||
|
||
@final | ||
def _wrap_transform_fast_result(self, result: NDFrameT) -> NDFrameT: | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you revert this line addition
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
resolved
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This still appears in the diff of this PR.