BUG: incorrect rounding in groupby.cummin near int64 implementation bounds by jbrockmendel · Pull Request #40767 · pandas-dev/pandas · GitHub
Merged

Changes from 1 commit
Handle min/max
jbrockmendel committed Apr 6, 2021
commit b8f5a2880d77059b152b60bbee36e734052a2ef9
32 changes: 26 additions & 6 deletions pandas/_libs/groupby.pyx
@@ -1122,6 +1122,7 @@ cdef group_min_max(groupby_t[:, ::1] out,
                     ndarray[groupby_t, ndim=2] values,
                     const intp_t[:] labels,
                     Py_ssize_t min_count=-1,
+                    bint is_datetimelike=False,
                     bint compute_max=True):
     """
     Compute minimum/maximum of columns of `values`, in row groups `labels`.
@@ -1139,6 +1140,8 @@ cdef group_min_max(groupby_t[:, ::1] out,
     min_count : Py_ssize_t, default -1
         The minimum number of non-NA group elements, NA result if threshold
         is not met
+    is_datetimelike : bool
+        True if `values` contains datetime-like entries.
     compute_max : bint, default True
         True to compute group-wise max, False to compute min

@@ -1187,8 +1190,7 @@ cdef group_min_max(groupby_t[:, ::1] out,
             for j in range(K):
                 val = values[i, j]

-                if not _treat_as_na(val, True):
-                    # TODO: Sure we always want is_datetimelike=True?
+                if not _treat_as_na(val, is_datetimelike):
                     nobs[lab, j] += 1
                     if compute_max:
                         if val > group_min_or_max[lab, j]:
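For context on the change above: with is_datetimelike threaded through, iNaT (the int64 NaT sentinel) is treated as missing only for datetime-like data. A minimal pure-Python sketch of that decision for int64 values (an illustration of the semantics, not the actual Cython helper):

from pandas._libs.tslibs import iNaT  # iNaT == np.iinfo(np.int64).min

def treat_as_na_int64(val, is_datetimelike):
    # iNaT is the NaT sentinel for datetime-like data; for plain
    # int64 columns it is a legitimate (if extreme) value.
    return is_datetimelike and val == iNaT

assert treat_as_na_int64(iNaT, True)       # NaT in datetime data
assert not treat_as_na_int64(iNaT, False)  # ordinary int64 minimum

Hard-coding True, as the removed line did, made group min/max treat iNaT as missing even in plain integer columns, which is exactly what the deleted TODO was questioning.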
@@ -1220,9 +1222,18 @@ def group_max(groupby_t[:, ::1] out,
               int64_t[::1] counts,
               ndarray[groupby_t, ndim=2] values,
               const intp_t[:] labels,
-              Py_ssize_t min_count=-1) -> None:
+              Py_ssize_t min_count=-1,
+              bint is_datetimelike=False) -> None:
     """See group_min_max.__doc__"""
-    group_min_max(out, counts, values, labels, min_count=min_count, compute_max=True)
+    group_min_max(
+        out,
+        counts,
+        values,
+        labels,
+        min_count=min_count,
+        is_datetimelike=is_datetimelike,
+        compute_max=True,
+    )


@cython.wraparound(False)
@@ -1231,9 +1242,18 @@ def group_min(groupby_t[:, ::1] out,
              int64_t[::1] counts,
              ndarray[groupby_t, ndim=2] values,
              const intp_t[:] labels,
-             Py_ssize_t min_count=-1) -> None:
+             Py_ssize_t min_count=-1,
+             bint is_datetimelike=False) -> None:
     """See group_min_max.__doc__"""
-    group_min_max(out, counts, values, labels, min_count=min_count, compute_max=False)
+    group_min_max(
+        out,
+        counts,
+        values,
+        labels,
+        min_count=min_count,
+        is_datetimelike=is_datetimelike,
+        compute_max=False,
+    )


@cython.boundscheck(False)
16 changes: 13 additions & 3 deletions pandas/core/groupby/ops.py
@@ -694,19 +694,29 @@ def _cython_operation(
         result = maybe_fill(np.empty(out_shape, dtype=out_dtype))
         if kind == "aggregate":
             counts = np.zeros(ngroups, dtype=np.int64)
-            func(result, counts, values, comp_ids, min_count)
+            if how in ["min", "max"]:
+                func(
+                    result,
+                    counts,
+                    values,
+                    comp_ids,
+                    min_count,
+                    is_datetimelike=is_datetimelike,
+                )
+            else:
+                func(result, counts, values, comp_ids, min_count)
         elif kind == "transform":
             # TODO: min_count
             func(result, values, comp_ids, ngroups, is_datetimelike, **kwargs)

         if kind == "aggregate":
             # i.e. counts is defined
Contributor:
can you reference either this PR & put in an expl here, this is really unexpected and non-obvious what is happening. ping on green.

Member Author:
ping
             if is_integer_dtype(result.dtype) and not is_datetimelike:
                 # TODO: we don't have any tests that get here with min_count > 1
                 cutoff = max(1, min_count)
                 empty_groups = counts < cutoff
                 if empty_groups.any():
Contributor:
why is this check needed?

Member Author:
if empty_groups.any() then we need to mask in order to cast to float64

Contributor:
this behavior is really surprising, though likely not hit practically. we should move aggressively to return Int dtypes here. Yes this is a breaking change but fixes these types of value dependent behavior.

Member Author:
> we should move aggressively to return Int dtypes here

we'd need 2D support for Int dtypes for me to consider getting on board with this, xref #38992

-                    result = result.astype("float64")  # TODO: could be lossy
+                    # Note: this conversion could be lossy, see GH#40767
+                    result = result.astype("float64")
                     result[empty_groups] = np.nan

         if self._filter_empty_groups and not counts.all():
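To make the exchange above concrete, here is a standalone sketch (plain NumPy with assumed toy values, not the pandas internals) of why a failed min_count forces the float64 cast, and why that cast is lossy near the int64 implementation bounds:

import numpy as np
from pandas._libs.tslibs import iNaT  # iNaT == np.iinfo(np.int64).min

# int64 has no NaN, so any group failing min_count forces the whole
# result to float64 before NaN can be written into it.
counts = np.array([2, 1, 2])                         # per-group sizes
result = np.array([5, 7, iNaT + 1], dtype=np.int64)  # per-group mins
empty_groups = counts < 2                            # groups failing min_count
if empty_groups.any():
    result = result.astype("float64")  # the potentially lossy step
    result[empty_groups] = np.nan
print(result)  # -> [5.0, nan, -9.223372036854776e+18] (group 2 masked)

# Why lossy: float64 has a 53-bit significand, so int64 values near the
# implementation bounds are rounded; iNaT + 1 becomes indistinguishable
# from iNaT itself after the cast.
assert np.float64(iNaT + 1) == np.float64(iNaT)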
33 changes: 33 additions & 0 deletions pandas/tests/groupby/test_function.py
@@ -4,6 +4,7 @@
 import numpy as np
 import pytest

+from pandas._libs.tslibs import iNaT
 from pandas.errors import UnsupportedFunctionCall

 import pandas as pd
@@ -635,6 +636,38 @@ def test_max_nan_bug():
     assert not r["File"].isna().any()


+def test_max_inat():
+    # GH#40767 dont interpret iNaT as NaN
+    ser = Series([1, iNaT])
+    gb = ser.groupby([1, 1])
+
+    result = gb.max(min_count=2)
+    expected = Series({1: 1}, dtype=np.int64)
+    tm.assert_series_equal(result, expected, check_exact=True)
+
+    result = gb.min(min_count=2)
+    expected = Series({1: iNaT}, dtype=np.int64)
+    tm.assert_series_equal(result, expected, check_exact=True)
+
+    # not enough entries -> gets masked to NaN
+    result = gb.min(min_count=3)
+    expected = Series({1: np.nan})
+    tm.assert_series_equal(result, expected, check_exact=True)
+
+
+def test_max_inat_not_all_na():
+    # GH#40767 dont interpret iNaT as NaN
+
+    # make sure we dont round iNaT+1 to iNaT
+    ser = Series([1, iNaT, 2, iNaT + 1])
+    gb = ser.groupby([1, 2, 3, 3])
+    result = gb.min(min_count=2)
+
+    # Note: in converting to float64, the iNaT + 1 maps to iNaT, i.e. is lossy
+    expected = Series({1: np.nan, 2: np.nan, 3: iNaT + 1})
+    tm.assert_series_equal(result, expected, check_exact=True)
+
+
 def test_nlargest():
     a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10])
     b = Series(list("a" * 5 + "b" * 5))
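Read together, the new tests pin down the intended behavior at the public API level. A condensed walk-through of test_max_inat's expectations (assuming a build with this branch applied):

import pandas as pd
from pandas._libs.tslibs import iNaT

ser = pd.Series([1, iNaT])
gb = ser.groupby([1, 1])

# Both entries now count as real values, so min_count=2 is satisfied
# and the results stay int64 (no float64 cast, no rounding).
print(gb.max(min_count=2))  # 1
print(gb.min(min_count=2))  # iNaT, i.e. the int64 minimum

# With min_count=3 the group falls short and is masked to NaN,
# which is what triggers the (potentially lossy) float64 cast.
print(gb.min(min_count=3))  # NaN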