Aligned group_fillna and group_shift signatures

pandas-dev · jreback · Feb 25, 2018 · Feb 12, 2018 · Feb 13, 2018 · Feb 13, 2018
commit 5da06d868dc5cf044a84a32dcf74f0924d2b919d
diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in
@@ -906,7 +906,7 @@ def group_cumsum(numeric[:, :] out,
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def group_shift_indexer(int64_t[:] out, int64_t[:] labels,
+def group_shift_indexer(ndarray[int64_t] out, ndarray[int64_t] labels,
                         int ngroups, int periods):
     cdef:
         Py_ssize_t N, i, j, ii
@@ -957,21 +957,19 @@ def group_shift_indexer(int64_t[:] out, int64_t[:] labels,
 
 @cython.wraparound(False)
 @cython.boundscheck(False)
-def group_fillna_indexer(ndarray[int64_t] out,
-                         ndarray[uint8_t] mask,
-                         ndarray[int64_t] labels,
-                         object method,
+def group_fillna_indexer(ndarray[int64_t] out, ndarray[int64_t] labels,
+                         ndarray[uint8_t] mask, object direction,
                          int64_t limit):
-    """Fills values forwards or backwards within a group
+    """Indexes how to fill values forwards or backwards within a group
 
     Parameters
     ----------
     out : array of int64_t values which this method will write its results to
         Missing values will be written to with a value of -1
-    mask : array of int64_t values where a 1 indicates a missing value
     labels : array containing unique label for each group, with its ordering
         matching up to the corresponding record in `values`
-    method : {'ffill', 'bfill'}
+    mask : array of int64_t values where a 1 indicates a missing value
+    direction : {'ffill', 'bfill'}
         Direction for fill to be applied (forwards or backwards, respectively)
     limit : Consecutive values to fill before stopping, or -1 for no limit
 
@@ -987,8 +985,11 @@ def group_fillna_indexer(ndarray[int64_t] out,
 
     N = len(out)
 
+    # Make sure all arrays are the same size
+    assert N == len(labels) == len(mask)
+
     sorted_labels = np.argsort(labels).astype(np.int64, copy=False)
-    if method == 'bfill':
+    if direction == 'bfill':
         sorted_labels = sorted_labels[::-1]
 
     with nogil:
@@ -1004,6 +1005,7 @@ def group_fillna_indexer(ndarray[int64_t] out,
                 curr_fill_idx = idx
 
             out[idx] = curr_fill_idx
+
             # If we move to the next group, reset
             # the fill_idx and counter
             if i == N - 1 or labels[idx] != labels[sorted_labels[i+1]]:

diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
@@ -1,5 +1,5 @@
 import types
-from functools import wraps
+from functools import wraps, partial
 import numpy as np
 import datetime
 import collections
@@ -1457,25 +1457,14 @@ def expanding(self, *args, **kwargs):
         from pandas.core.window import ExpandingGroupby
         return ExpandingGroupby(self, *args, **kwargs)
 
-    def _fill(self, how, limit=None):
-        labels, _, _ = self.grouper.group_info
-
+    def _fill(self, direction, limit=None):
         # Need int value for Cython
         if limit is None:
             limit = -1
-        output = {}
-        if type(self) is DataFrameGroupBy:
-            for grp in self.grouper.groupings:
-                ser = grp.group_index.take(grp.labels)
-                output[ser.name] = ser.values
-        for name, obj in self._iterate_slices():
-            indexer = np.zeros_like(labels)
-            mask = isnull(obj.values).view(np.uint8)
-            libgroupby.group_fillna_indexer(indexer, mask, labels, how,
-                                            limit)
-            output[name] = algorithms.take_nd(obj.values, indexer)
 
-        return self._wrap_transformed_output(output)
+        return self._get_cythonized_result('group_fillna_indexer',
+                                           self.grouper, needs_mask=True,
+                                           direction=direction, limit=limit)
 
     @Substitution(name='groupby')
     def pad(self, limit=None):
@@ -1863,6 +1852,52 @@ def cummax(self, axis=0, **kwargs):
 
         return self._cython_transform('cummax', numeric_only=False)
 
+    def _get_cythonized_result(self, how, grouper, needs_mask=False,
+                               needs_ngroups=False, **kwargs):
+        """Get result for Cythonized functions
+
+        Parameters
+        ----------
+        how : str, Cythonized function name to be called
+        grouper : Grouper object containing pertinent group info
+        needs_mask : bool, default False
+            Whether boolean mask needs to be part of the Cython call signature
+        needs_ngroups : bool, default False
+            Whether number of groups part of the Cython call signature
+        **kwargs : dict
+            Extra arguments required for the given function. This method
+            internally stores an OrderedDict that maps those keywords to
+            positional arguments before calling the Cython layer
+
+        Returns
+        -------
+        GroupBy object populated with appropriate result(s)
+        """
+        exp_kwds = collections.OrderedDict([
+            (('group_fillna_indexer'), ('direction', 'limit')),
+            (('group_shift_indexer'), ('nperiods',))])
+
+        labels, _, ngroups = grouper.group_info
+        output = collections.OrderedDict()
+        base_func = getattr(libgroupby, how)
+
+        for name, obj in self._iterate_slices():
+            indexer = np.zeros_like(labels)
+            func = partial(base_func, indexer, labels)
+            if needs_mask:
+                mask = isnull(obj.values).astype(np.uint8, copy=False)
+                func = partial(func, mask)
+
+            if needs_ngroups:
+                func = partial(func, ngroups)
+
+            # Convert any keywords into positional arguments
+            func = partial(func, *(kwargs[x] for x in exp_kwds[how]))
+            func()  # Call func to modify indexer values in place
+            output[name] = algorithms.take_nd(obj.values, indexer)
+
+        return self._wrap_transformed_output(output)
+
     @Substitution(name='groupby')
     @Appender(_doc_template)
     def shift(self, periods=1, freq=None, axis=0):
@@ -1880,17 +1915,10 @@ def shift(self, periods=1, freq=None, axis=0):
         if freq is not None or axis != 0:
             return self.apply(lambda x: x.shift(periods, freq, axis))
 
-        labels, _, ngroups = self.grouper.group_info
-
-        # filled in by Cython
-        indexer = np.zeros_like(labels)
-        libgroupby.group_shift_indexer(indexer, labels, ngroups, periods)
+        return self._get_cythonized_result('group_shift_indexer',
+                                           self.grouper, needs_ngroups=True,
+                                           nperiods=periods)
 
-        output = {}
-        for name, obj in self._iterate_slices():
-            output[name] = algorithms.take_nd(obj.values, indexer)
-
-        return self._wrap_transformed_output(output)
 
     @Substitution(name='groupby')
     @Appender(_doc_template)
@@ -3597,7 +3625,6 @@ def describe(self, **kwargs):
     def value_counts(self, normalize=False, sort=True, ascending=False,
                      bins=None, dropna=True):
 
-        from functools import partial
         from pandas.core.reshape.tile import cut
         from pandas.core.reshape.merge import _get_join_indexers
 
@@ -4605,9 +4632,18 @@ def _apply_to_column_groupbys(self, func):
              in self._iterate_column_groupbys()),
             keys=self._selected_obj.columns, axis=1)
 
+    def _fill(self, direction, limit=None):
+        """Overriden method to concat grouped columns in output"""
+        res = super()._fill(direction, limit=limit)
+        output = collections.OrderedDict()
+        for grp in self.grouper.groupings:
+            ser = grp.group_index.take(grp.labels)
+            output[ser.name] = ser.values
+
+        return self._wrap_transformed_output(output).join(res)
+
     def count(self):
         """ Compute count of group, excluding missing values """
-        from functools import partial
         from pandas.core.dtypes.missing import _isna_ndarraylike as isna
 
         data, _ = self._get_data_to_aggregate()