-
-
Notifications
You must be signed in to change notification settings - Fork 18.7k
Cythonized GroupBy Fill #19673
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Cythonized GroupBy Fill #19673
Changes from 1 commit
c58123c
2bc8023
3cb25c0
7fecc11
3c2fb36
a52b8c4
16c1823
bd3d5e0
cae65af
0266514
50dc690
9fa8e25
5da06d8
2fe91a4
825ba17
127c71c
3a23cd6
a363146
fd513c8
776d1b7
33f0d06
662008a
27e24fa
6f72476
eff6603
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
- Loading branch information
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -906,7 +906,7 @@ def group_cumsum(numeric[:, :] out, | |
|
||
@cython.boundscheck(False) | ||
@cython.wraparound(False) | ||
def group_shift_indexer(int64_t[:] out, int64_t[:] labels, | ||
def group_shift_indexer(ndarray[int64_t] out, ndarray[int64_t] labels, | ||
int ngroups, int periods): | ||
cdef: | ||
Py_ssize_t N, i, j, ii | ||
|
@@ -957,21 +957,19 @@ def group_shift_indexer(int64_t[:] out, int64_t[:] labels, | |
|
||
@cython.wraparound(False) | ||
@cython.boundscheck(False) | ||
def group_fillna_indexer(ndarray[int64_t] out, | ||
ndarray[uint8_t] mask, | ||
ndarray[int64_t] labels, | ||
object method, | ||
def group_fillna_indexer(ndarray[int64_t] out, ndarray[int64_t] labels, | ||
ndarray[uint8_t] mask, object direction, | ||
int64_t limit): | ||
"""Fills values forwards or backwards within a group | ||
"""Indexes how to fill values forwards or backwards within a group | ||
|
||
Parameters | ||
---------- | ||
out : array of int64_t values which this method will write its results to | ||
Missing values will be written to with a value of -1 | ||
mask : array of int64_t values where a 1 indicates a missing value | ||
labels : array containing unique label for each group, with its ordering | ||
matching up to the corresponding record in `values` | ||
method : {'ffill', 'bfill'} | ||
mask : array of int64_t values where a 1 indicates a missing value | ||
direction : {'ffill', 'bfill'} | ||
Direction for fill to be applied (forwards or backwards, respectively) | ||
limit : Consecutive values to fill before stopping, or -1 for no limit | ||
|
||
|
@@ -987,8 +985,11 @@ def group_fillna_indexer(ndarray[int64_t] out, | |
|
||
N = len(out) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. maybe want an assert that N == len(labels) == len(mask) |
||
|
||
# Make sure all arrays are the same size | ||
assert N == len(labels) == len(mask) | ||
|
||
sorted_labels = np.argsort(labels).astype(np.int64, copy=False) | ||
if method == 'bfill': | ||
if direction == 'bfill': | ||
sorted_labels = sorted_labels[::-1] | ||
|
||
with nogil: | ||
|
@@ -1004,6 +1005,7 @@ def group_fillna_indexer(ndarray[int64_t] out, | |
curr_fill_idx = idx | ||
|
||
out[idx] = curr_fill_idx | ||
|
||
# If we move to the next group, reset | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. blank line here |
||
# the fill_idx and counter | ||
if i == N - 1 or labels[idx] != labels[sorted_labels[i+1]]: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,5 @@ | ||
import types | ||
from functools import wraps | ||
from functools import wraps, partial | ||
import numpy as np | ||
import datetime | ||
import collections | ||
|
@@ -1457,25 +1457,14 @@ def expanding(self, *args, **kwargs): | |
from pandas.core.window import ExpandingGroupby | ||
return ExpandingGroupby(self, *args, **kwargs) | ||
|
||
def _fill(self, how, limit=None): | ||
labels, _, _ = self.grouper.group_info | ||
|
||
def _fill(self, direction, limit=None): | ||
# Need int value for Cython | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you add a doc-string |
||
if limit is None: | ||
limit = -1 | ||
output = {} | ||
if type(self) is DataFrameGroupBy: | ||
for grp in self.grouper.groupings: | ||
ser = grp.group_index.take(grp.labels) | ||
output[ser.name] = ser.values | ||
for name, obj in self._iterate_slices(): | ||
indexer = np.zeros_like(labels) | ||
mask = isnull(obj.values).view(np.uint8) | ||
libgroupby.group_fillna_indexer(indexer, mask, labels, how, | ||
limit) | ||
output[name] = algorithms.take_nd(obj.values, indexer) | ||
|
||
return self._wrap_transformed_output(output) | ||
return self._get_cythonized_result('group_fillna_indexer', | ||
self.grouper, needs_mask=True, | ||
direction=direction, limit=limit) | ||
|
||
@Substitution(name='groupby') | ||
def pad(self, limit=None): | ||
|
@@ -1863,6 +1852,52 @@ def cummax(self, axis=0, **kwargs): | |
|
||
return self._cython_transform('cummax', numeric_only=False) | ||
|
||
def _get_cythonized_result(self, how, grouper, needs_mask=False, | ||
needs_ngroups=False, **kwargs): | ||
"""Get result for Cythonized functions | ||
|
||
Parameters | ||
---------- | ||
how : str, Cythonized function name to be called | ||
grouper : Grouper object containing pertinent group info | ||
needs_mask : bool, default False | ||
Whether boolean mask needs to be part of the Cython call signature | ||
needs_ngroups : bool, default False | ||
Whether number of groups part of the Cython call signature | ||
**kwargs : dict | ||
Extra arguments required for the given function. This method | ||
internally stores an OrderedDict that maps those keywords to | ||
positional arguments before calling the Cython layer | ||
|
||
Returns | ||
------- | ||
GroupBy object populated with appropriate result(s) | ||
""" | ||
exp_kwds = collections.OrderedDict([ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. these should be passed in directly (from the calling function), no? rather than hard coded inside this function. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Right, maybe the caller passes in a partial the kwargs baked rather than the string version of the funtion?
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I’ll think more about that. The one complicating factor is that masked values cannot be passed in directly as they are calculated on a per slice basis. To avoid a convoluted set of calls I figured it would make sense to have all keywords and positional arguments resolved within one function, but there’s certainly other ways to go about this There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @chris-b1 comment is about the exp_kwds, I am still not clear why the caller cannot pass these |
||
(('group_fillna_indexer'), ('direction', 'limit')), | ||
(('group_shift_indexer'), ('nperiods',))]) | ||
|
||
labels, _, ngroups = grouper.group_info | ||
output = collections.OrderedDict() | ||
base_func = getattr(libgroupby, how) | ||
|
||
for name, obj in self._iterate_slices(): | ||
indexer = np.zeros_like(labels) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. do we need to specify int64? (or platform int)? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes, int64 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This was a copy/paste of existing code, so I didn't give it too much thought. That said, |
||
func = partial(base_func, indexer, labels) | ||
if needs_mask: | ||
mask = isnull(obj.values).astype(np.uint8, copy=False) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Contrary to the int64 case, you do want to use There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks for the heads up - still trying to wrap my head around some of the nuances there. Out of curiosity, how do you know this? Just from experience or is there a good reference? I do see that There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Unfortunately all I can really point to is trial and error. Some "general" rules, at least in a pandas context
|
||
func = partial(func, mask) | ||
|
||
if needs_ngroups: | ||
func = partial(func, ngroups) | ||
|
||
# Convert any keywords into positional arguments | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. you can pass kwargs to cython functions, so not sure this is necessary There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I explicitly converted kwargs into positional arguments from the first comment back in #19481. If we are OK with kwargs in the Cython layer I can rewrite this, as it could simplify a few things There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes, you can pass kwargs |
||
func = partial(func, *(kwargs[x] for x in exp_kwds[how])) | ||
func() # Call func to modify indexer values in place | ||
output[name] = algorithms.take_nd(obj.values, indexer) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. don't need .values here (I think), or maybe need There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same comment as above - believe we need the Looking at There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. no this is fine |
||
|
||
return self._wrap_transformed_output(output) | ||
|
||
@Substitution(name='groupby') | ||
@Appender(_doc_template) | ||
def shift(self, periods=1, freq=None, axis=0): | ||
|
@@ -1880,17 +1915,10 @@ def shift(self, periods=1, freq=None, axis=0): | |
if freq is not None or axis != 0: | ||
return self.apply(lambda x: x.shift(periods, freq, axis)) | ||
|
||
labels, _, ngroups = self.grouper.group_info | ||
|
||
# filled in by Cython | ||
indexer = np.zeros_like(labels) | ||
libgroupby.group_shift_indexer(indexer, labels, ngroups, periods) | ||
return self._get_cythonized_result('group_shift_indexer', | ||
self.grouper, needs_ngroups=True, | ||
nperiods=periods) | ||
|
||
output = {} | ||
for name, obj in self._iterate_slices(): | ||
output[name] = algorithms.take_nd(obj.values, indexer) | ||
|
||
return self._wrap_transformed_output(output) | ||
|
||
@Substitution(name='groupby') | ||
@Appender(_doc_template) | ||
|
@@ -3597,7 +3625,6 @@ def describe(self, **kwargs): | |
def value_counts(self, normalize=False, sort=True, ascending=False, | ||
bins=None, dropna=True): | ||
|
||
from functools import partial | ||
from pandas.core.reshape.tile import cut | ||
from pandas.core.reshape.merge import _get_join_indexers | ||
|
||
|
@@ -4605,9 +4632,18 @@ def _apply_to_column_groupbys(self, func): | |
in self._iterate_column_groupbys()), | ||
keys=self._selected_obj.columns, axis=1) | ||
|
||
def _fill(self, direction, limit=None): | ||
"""Overriden method to concat grouped columns in output""" | ||
res = super()._fill(direction, limit=limit) | ||
output = collections.OrderedDict() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. would be slightly simpler as a list-comprehension There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. though maybe this should be an arg to _fill? (IOW the option to do it should be done in fill) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do you mean as an arg to |
||
for grp in self.grouper.groupings: | ||
ser = grp.group_index.take(grp.labels) | ||
output[ser.name] = ser.values | ||
|
||
return self._wrap_transformed_output(output).join(res) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. here, rather than joining, could't the entries of There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As is the super call already wraps it's There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. you can do a concat no? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. concat should work just as well. Is that out of performance reasons we prefer that to join? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. just idiomatic |
||
|
||
def count(self): | ||
""" Compute count of group, excluding missing values """ | ||
from functools import partial | ||
from pandas.core.dtypes.missing import _isna_ndarraylike as isna | ||
|
||
data, _ = self._get_data_to_aggregate() | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
just realized this is in
groupby_helper.pxi.in
, since this doesn't use templating (nor does group_shift_indexer). I think ok to omve togroupby.pxi
. can you move?