PERF: Series(pyarrow-backed).rank by lukemanley · Pull Request #50264 · pandas-dev/pandas · GitHub
[go: up one dir, main page]

Skip to content

PERF: Series(pyarrow-backed).rank #50264

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Dec 17, 2022
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
use pyarrow for method="average"
  • Loading branch information
lukemanley committed Dec 16, 2022
commit 6ba998ee8e0ebf1893d45c145f9e694e126dada6
51 changes: 27 additions & 24 deletions pandas/core/arrays/arrow/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -952,61 +952,64 @@ def _indexing_key_to_indices(
return indices

def _rank(
self: ArrowExtensionArrayT,
self,
*,
axis: AxisInt = 0,
method: str = "average",
na_option: str = "keep",
ascending: bool = True,
pct: bool = False,
) -> ArrowExtensionArrayT:
):
"""
See Series.rank.__doc__.
"""
if axis != 0:
raise NotImplementedError

if (
pa_version_under9p0
# as of version 10, pyarrow does not support an "average" method
or method not in ("min", "max", "first", "dense")
):
from pandas.core.algorithms import rank

ranked = rank(
self.to_numpy(),
if pa_version_under9p0:
ranked = super().rank(
axis=axis,
method=method,
na_option=na_option,
ascending=ascending,
pct=pct,
)
if method != "average" and not pct:
pa_type = pa.uint64()
else:
# keep dtypes consistent with the implementation below
if method == "average" or pct:
pa_type = pa.float64()
else:
pa_type = pa.uint64()
result = pa.array(ranked, type=pa_type, from_pandas=True)
return type(self)(result)

sort_keys = "ascending" if ascending else "descending"
if axis != 0:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this can be combined with the check above, i.e. `if pa_version_under9p0 or axis != 0:`.

An added benefit is that if the base implementation ever implements axis != 0, this array gets it for free.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated to fall through for axis != 0

raise NotImplementedError

if na_option == "top":
null_placement = "at_start"
else:
null_placement = "at_end"
data = self._data.combine_chunks()
sort_keys = "ascending" if ascending else "descending"
null_placement = "at_start" if na_option == "top" else "at_end"
tiebreaker = "min" if method == "average" else method

result = pc.rank(
self._data.combine_chunks(),
data,
sort_keys=sort_keys,
null_placement=null_placement,
tiebreaker=method,
tiebreaker=tiebreaker,
)

if na_option == "keep":
mask = pc.is_null(self._data)
null = pa.scalar(None, type=result.type)
result = pc.if_else(mask, null, result)

if method == "average":
result_max = pc.rank(
data,
sort_keys=sort_keys,
null_placement=null_placement,
tiebreaker="max",
)
result_max = result_max.cast(pa.float64())
result_min = result.cast(pa.float64())
result = pc.divide(pc.add(result_min, result_max), 2)

if pct:
if not pa.types.is_floating(result.type):
result = result.cast(pa.float64())
Expand Down
0