PERF/BUG: use masked algo in groupby cummin and cummax by mzeitlin11 · Pull Request #40651 · pandas-dev/pandas · GitHub
[go: up one dir, main page]

Skip to content

PERF/BUG: use masked algo in groupby cummin and cummax #40651

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 44 commits into from
Apr 21, 2021
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
44 commits
Select commit Hold shift + click to select a range
7cd4dc3
wip
mzeitlin11 Mar 26, 2021
91984dc
wip
mzeitlin11 Mar 26, 2021
b371cc5
wip
mzeitlin11 Mar 26, 2021
69cce96
wip
mzeitlin11 Mar 26, 2021
dd7f324
wip
mzeitlin11 Mar 26, 2021
64680d4
wip
mzeitlin11 Mar 26, 2021
be16f65
wip
mzeitlin11 Mar 26, 2021
9442846
wip
mzeitlin11 Mar 26, 2021
f089175
wip
mzeitlin11 Mar 26, 2021
31409f8
wip
mzeitlin11 Mar 26, 2021
0c05f74
wip
mzeitlin11 Mar 26, 2021
18dcc94
Merge remote-tracking branch 'origin/master' into perf/masked_cummin/max
mzeitlin11 Mar 26, 2021
5c60a1f
wip
mzeitlin11 Mar 26, 2021
f0c27ce
PERF: use masked algo in groupby cummin and cummax
mzeitlin11 Mar 27, 2021
2fa80ad
Avoid mask copy
mzeitlin11 Mar 27, 2021
280c7e5
Update whatsnew
mzeitlin11 Mar 27, 2021
dca28cf
Merge remote-tracking branch 'origin/master' into perf/masked_cummin/max
mzeitlin11 Apr 1, 2021
7e2fbe0
Merge fixup
mzeitlin11 Apr 1, 2021
0ebb97a
Follow transpose
mzeitlin11 Apr 1, 2021
0009dfd
Compute mask usage inside algo
mzeitlin11 Apr 1, 2021
6663832
try optional
mzeitlin11 Apr 1, 2021
8247f82
WIP
mzeitlin11 Apr 1, 2021
71e1c4f
Use more contiguity
mzeitlin11 Apr 1, 2021
c6cf9ee
Shrink benchmark
mzeitlin11 Apr 1, 2021
02768ec
Merge remote-tracking branch 'origin/master' into perf/masked_cummin/max
mzeitlin11 Apr 1, 2021
836175b
Merge remote-tracking branch 'origin/master' into perf/masked_cummin/max
mzeitlin11 Apr 2, 2021
293dc6e
Revert unrelated
mzeitlin11 Apr 2, 2021
478c6c9
Merge remote-tracking branch 'origin/master' into perf/masked_cummin/max
mzeitlin11 Apr 6, 2021
fa45a9a
Merge remote-tracking branch 'origin/master' into perf/masked_cummin/max
mzeitlin11 Apr 8, 2021
1632b81
Merge remote-tracking branch 'origin/master' into perf/masked_cummin/max
mzeitlin11 Apr 12, 2021
1bb344e
Remove merge conflict relic
mzeitlin11 Apr 12, 2021
97d9eea
Update doc/source/whatsnew/v1.3.0.rst
mzeitlin11 Apr 13, 2021
892a92a
Update doc/source/whatsnew/v1.3.0.rst
mzeitlin11 Apr 13, 2021
a239a68
Update pandas/core/groupby/ops.py
mzeitlin11 Apr 13, 2021
f98ca35
Merge remote-tracking branch 'origin' into perf/masked_cummin/max
mzeitlin11 Apr 13, 2021
e7ed12f
Merge branch 'perf/masked_cummin/max' of github.com:/mzeitlin11/panda…
mzeitlin11 Apr 13, 2021
a1422ba
Address comments
mzeitlin11 Apr 13, 2021
482a209
Change random generation style
mzeitlin11 Apr 13, 2021
4e7404d
Merge remote-tracking branch 'origin' into perf/masked_cummin/max
mzeitlin11 Apr 18, 2021
251c02a
Use conditional instead of partial
mzeitlin11 Apr 18, 2021
3de7e5e
Remove ensure_int_or_float
mzeitlin11 Apr 18, 2021
237f86f
Remove unnecessary condition
mzeitlin11 Apr 18, 2021
a1b0c04
Merge remote-tracking branch 'origin' into perf/masked_cummin/max
mzeitlin11 Apr 19, 2021
5e1dac4
Merge remote-tracking branch 'origin' into perf/masked_cummin/max
mzeitlin11 Apr 20, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Compute mask usage inside algo
  • Loading branch information
mzeitlin11 committed Apr 1, 2021
commit 0009dfd367d6bfb74381a72a3c3e4314b39edfcc
17 changes: 5 additions & 12 deletions pandas/_libs/groupby.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1236,7 +1236,6 @@ def group_cummin_max(groupby_t[:, ::1] out,
const intp_t[:] labels,
int ngroups,
bint is_datetimelike,
bint use_mask,
bint compute_max):
"""
Cumulative minimum/maximum of columns of `values`, in row groups `labels`.
Expand All @@ -1256,10 +1255,6 @@ def group_cummin_max(groupby_t[:, ::1] out,
Number of groups, larger than all entries of `labels`.
is_datetimelike : bool
True if `values` contains datetime-like entries.
use_mask : bool
True if the mask should be used (otherwise we continue
as if it is not a masked algorithm). Avoids the cost
of checking for a completely zeroed mask.
compute_max : bool
True if cumulative maximum should be computed, False
if cumulative minimum should be computed
Expand All @@ -1273,7 +1268,9 @@ def group_cummin_max(groupby_t[:, ::1] out,
groupby_t val, mval
ndarray[groupby_t, ndim=2] accum
intp_t lab
bint val_is_nan
bint val_is_nan, use_mask

use_mask = mask is not None

N, K = (<object>values).shape
accum = np.empty((ngroups, K), dtype=np.asarray(values).dtype)
Expand Down Expand Up @@ -1331,8 +1328,7 @@ def group_cummin(groupby_t[:, ::1] out,
uint8_t[:, ::1] mask,
const intp_t[:] labels,
int ngroups,
bint is_datetimelike,
bint use_mask):
bint is_datetimelike):
"""See group_cummin_max.__doc__"""
group_cummin_max(
out,
Expand All @@ -1341,7 +1337,6 @@ def group_cummin(groupby_t[:, ::1] out,
labels,
ngroups,
is_datetimelike,
use_mask,
compute_max=False
)

Expand All @@ -1353,8 +1348,7 @@ def group_cummax(groupby_t[:, ::1] out,
uint8_t[:, ::1] mask,
const intp_t[:] labels,
int ngroups,
bint is_datetimelike,
bint use_mask):
bint is_datetimelike):
"""See group_cummin_max.__doc__"""
group_cummin_max(
out,
Expand All @@ -1363,6 +1357,5 @@ def group_cummax(groupby_t[:, ::1] out,
labels,
ngroups,
is_datetimelike,
use_mask,
compute_max=True
)
14 changes: 5 additions & 9 deletions pandas/core/groupby/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -652,6 +652,7 @@ def _cython_operation(
# can we do this operation with our cython functions
# if not raise NotImplementedError
self._disallow_invalid_ops(dtype, how, is_numeric)

func_uses_mask = cython_function_uses_mask(how)
if is_extension_array_dtype(dtype):
if isinstance(values, BaseMaskedArray) and func_uses_mask:
Expand Down Expand Up @@ -718,10 +719,6 @@ def _cython_operation(
out_shape = (ngroups,) + values.shape[1:]

func, values = self._get_cython_func_and_vals(kind, how, values, is_numeric)
use_mask = mask is not None
if func_uses_mask:
if mask is None:
mask = np.zeros_like(values, dtype=np.uint8, order="C")

if how == "rank":
out_dtype = "float"
Expand All @@ -740,10 +737,10 @@ def _cython_operation(
elif kind == "transform":
# TODO: min_count
result = self._transform(
result, values, func, is_datetimelike, use_mask, mask, **kwargs
result, values, func, is_datetimelike, mask, func_uses_mask, **kwargs
)

if not use_mask and is_integer_dtype(result.dtype) and not is_datetimelike:
if mask is None and is_integer_dtype(result.dtype) and not is_datetimelike:
result_mask = result == iNaT
if result_mask.any():
result = result.astype("float64")
Expand Down Expand Up @@ -784,21 +781,20 @@ def _transform(
values: np.ndarray,
transform_func,
is_datetimelike: bool,
use_mask: bool,
mask: np.ndarray | None,
needs_mask: bool = False,
**kwargs,
) -> np.ndarray:

comp_ids, _, ngroups = self.group_info
if mask is not None:
if needs_mask:
transform_func(
result,
values,
mask,
comp_ids,
ngroups,
is_datetimelike,
use_mask,
**kwargs,
)
else:
Expand Down
0