BUG: groupby.agg should always agg by rhshadrach · Pull Request #57706 · pandas-dev/pandas · GitHub

BUG: groupby.agg should always agg #57706


Open

wants to merge 11 commits into base: main
Changes from 1 commit
BUG: groupby.agg should always agg
rhshadrach committed Mar 2, 2024
commit 3505c7a4227fbff1fa7e807e33845f6d8db58e04
28 changes: 0 additions & 28 deletions doc/source/development/maintaining.rst
@@ -326,34 +326,6 @@ a milestone before tagging, you can request the bot to backport it with:
@Meeseeksdev backport <branch>


.. _maintaining.asv-machine:

Benchmark machine
-----------------

The team currently owns dedicated hardware for hosting a website for pandas' ASV performance benchmark. The results
are published to https://asv-runner.github.io/asv-collection/pandas/

Configuration
`````````````

The machine can be configured with the `Ansible <http://docs.ansible.com/ansible/latest/index.html>`_ playbook in https://github.com/tomaugspurger/asv-runner.

Publishing
``````````

The results are published to another GitHub repository, https://github.com/tomaugspurger/asv-collection.
Finally, we have a cron job on our docs server to pull from https://github.com/tomaugspurger/asv-collection, to serve them from ``/speed``.
Ask Tom or Joris for access to the webserver.

Debugging
`````````

The benchmarks are scheduled by Airflow. It has a dashboard for viewing and debugging the results. You'll need to set up an SSH tunnel to view them:

ssh -L 8080:localhost:8080 pandas@panda.likescandy.com


.. _maintaining.release:

Release process
7 changes: 4 additions & 3 deletions doc/source/whatsnew/v3.0.0.rst
@@ -191,6 +191,7 @@ Removal of prior version deprecations/changes
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- :func:`read_excel`, :func:`read_json`, :func:`read_html`, and :func:`read_xml` no longer accept raw string or byte representation of the data. That type of data must be wrapped in a :py:class:`StringIO` or :py:class:`BytesIO` (:issue:`53767`)
- :meth:`Series.dt.to_pydatetime` now returns a :class:`Series` of :py:class:`datetime.datetime` objects (:issue:`52459`)
- :meth:`SeriesGroupBy.agg` no longer pins the name of the group to the input passed to the provided ``func`` (:issue:`51703`)
- All arguments except ``name`` in :meth:`Index.rename` are now keyword only (:issue:`56493`)
- All arguments except the first ``path``-like argument in IO writers are now keyword only (:issue:`54229`)
- All arguments in :meth:`Index.sort_values` are now keyword only (:issue:`56493`)
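Editor's note on the :meth:`SeriesGroupBy.agg` entry above: the group label is no longer pinned to the Series handed to ``func``, so UDFs that read ``s.name`` inside ``agg`` must be rewritten to iterate over the groupby object, as the removed warning below also advises. A minimal sketch of the migration, with an illustrative lookup table and column names that are not from the PR:

import pandas as pd

df = pd.DataFrame({"key": ["a", "a", "b"], "val": [1, 2, 3]})
lookup = {"a": 10, "b": 20}

# Before: each group Series had .name set to the group label, so this worked:
# df.groupby("key")["val"].agg(lambda s: s.sum() * lookup[s.name])

# After: iterate over the groupby object instead, where the label is explicit.
result = pd.Series(
    {name: group.sum() * lookup[name] for name, group in df.groupby("key")["val"]}
)
print(result)  # a -> 30, b -> 60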
@@ -238,12 +239,12 @@ Removal of prior version deprecations/changes
- Removed unused arguments ``*args`` and ``**kwargs`` in :class:`Resampler` methods (:issue:`50977`)
- Unrecognized timezones when parsing strings to datetimes now raises a ``ValueError`` (:issue:`51477`)


.. ---------------------------------------------------------------------------
.. _whatsnew_300.performance:

Performance improvements
~~~~~~~~~~~~~~~~~~~~~~~~
- :meth:`Series.str.extract` returns a :class:`RangeIndex` columns instead of an :class:`Index` column when possible (:issue:`57542`)
- Performance improvement in :class:`DataFrame` when ``data`` is a ``dict`` and ``columns`` is specified (:issue:`24368`)
- Performance improvement in :meth:`DataFrame.join` for sorted but non-unique indexes (:issue:`56941`)
- Performance improvement in :meth:`DataFrame.join` when left and/or right are non-unique and ``how`` is ``"left"``, ``"right"``, or ``"inner"`` (:issue:`56817`)
@@ -252,11 +253,11 @@ Performance improvements
- Performance improvement in :meth:`Index.join` by propagating cached attributes in cases where the result matches one of the inputs (:issue:`57023`)
- Performance improvement in :meth:`Index.take` when ``indices`` is a full range indexer from zero to length of index (:issue:`56806`)
- Performance improvement in :meth:`MultiIndex.equals` for equal length indexes (:issue:`56990`)
- Performance improvement in :meth:`RangeIndex.__getitem__` with a boolean mask returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57588`)
- Performance improvement in :meth:`RangeIndex.append` when appending the same index (:issue:`57252`)
- Performance improvement in :meth:`RangeIndex.take` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57445`)
- Performance improvement in indexing operations for string dtypes (:issue:`56997`)
- :meth:`Series.str.extract` returns a :class:`RangeIndex` columns instead of an :class:`Index` column when possible (:issue:`?``)
- Performance improvement in ``DataFrameGroupBy.__len__`` and ``SeriesGroupBy.__len__`` (:issue:`57595`)
- Performance improvement in indexing operations for string dtypes (:issue:`56997`)

.. ---------------------------------------------------------------------------
.. _whatsnew_300.bug_fixes:
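A quick illustration of the :meth:`Series.str.extract` entry above, under the assumption that unnamed capture groups produce the default integer column labels; data is illustrative:

import pandas as pd

s = pd.Series(["a1", "b2"])
out = s.str.extract(r"([ab])(\d)")  # two unnamed groups -> default integer columns
print(out.columns)  # RangeIndex(start=0, stop=2, step=1) when possible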
158 changes: 109 additions & 49 deletions pandas/core/groupby/generic.py
@@ -62,10 +62,7 @@
)
import pandas.core.common as com
from pandas.core.frame import DataFrame
from pandas.core.groupby import (
base,
ops,
)
from pandas.core.groupby import base
from pandas.core.groupby.groupby import (
GroupBy,
GroupByPlot,
@@ -373,34 +370,64 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs)
index=self._grouper.result_index,
dtype=obj.dtype,
)
return self._python_agg_general(func, *args, **kwargs)

if self._grouper.nkeys > 1:
return self._python_agg_general(func, *args, **kwargs)
agg = aggregate

try:
return self._python_agg_general(func, *args, **kwargs)
except KeyError:
# KeyError raised in test_groupby.test_basic is bc the func does
# a dictionary lookup on group.name, but group name is not
# pinned in _python_agg_general, only in _aggregate_named
result = self._aggregate_named(func, *args, **kwargs)
def _agg_for_resample(
self, func=None, *args, engine=None, engine_kwargs=None, **kwargs
):
relabeling = func is None
columns = None
if relabeling:
columns, func = validate_func_kwargs(kwargs)
kwargs = {}

warnings.warn(
"Pinning the groupby key to each group in "
f"{type(self).__name__}.agg is deprecated, and cases that "
"relied on it will raise in a future version. "
"If your operation requires utilizing the groupby keys, "
"iterate over the groupby object instead.",
FutureWarning,
stacklevel=find_stack_level(),
)
if isinstance(func, str):
if maybe_use_numba(engine) and engine is not None:
# Not all agg functions support numba, only propagate numba kwargs
# if user asks for numba, and engine is not None
# (if engine is None, the called function will handle the case where
# numba is requested via the global option)
kwargs["engine"] = engine
if engine_kwargs is not None:
kwargs["engine_kwargs"] = engine_kwargs
return getattr(self, func)(*args, **kwargs)

# result is a dict whose keys are the elements of result_index
result = Series(result, index=self._grouper.result_index)
result = self._wrap_aggregated_output(result)
return result
elif isinstance(func, abc.Iterable):
# Catch instances of lists / tuples
# but not the class list / tuple itself.
func = maybe_mangle_lambdas(func)
kwargs["engine"] = engine
kwargs["engine_kwargs"] = engine_kwargs
ret = self._aggregate_multiple_funcs(func, *args, **kwargs)
if relabeling:
# columns is not narrowed by mypy from relabeling flag
assert columns is not None # for mypy
ret.columns = columns
if not self.as_index:
ret = ret.reset_index()
return ret

agg = aggregate
else:
if maybe_use_numba(engine):
return self._aggregate_with_numba(
func, *args, engine_kwargs=engine_kwargs, **kwargs
)

if self.ngroups == 0:
# e.g. test_evaluate_with_empty_groups without any groups to
# iterate over, we have no output on which to do dtype
# inference. We default to using the existing dtype.
# xref GH#51445
obj = self._obj_with_exclusions
return self.obj._constructor(
[],
name=self.obj.name,
index=self._grouper.result_index,
dtype=obj.dtype,
)
return self._python_agg_general(func, *args, **kwargs)

def _python_agg_general(self, func, *args, **kwargs):
f = lambda x: func(x, *args, **kwargs)
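Editor's note: the dispatch that ``_agg_for_resample`` preserves above has three visible branches: a string func routes to the named method, an iterable goes through ``_aggregate_multiple_funcs``, and a callable ends in ``_python_agg_general``, which now always aggregates. A short sketch of the three branches through the public API (data is illustrative):

import pandas as pd

ser = pd.Series([1.0, 2.0, 3.0, 4.0], index=["a", "a", "b", "b"])
gb = ser.groupby(level=0)

print(gb.agg("sum"))            # string: dispatched to gb.sum()
print(gb.agg(["sum", "mean"]))  # iterable: one column per func
print(gb.agg(lambda s: s.max() - s.min()))  # callable: _python_agg_general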
@@ -527,26 +554,6 @@ def _wrap_applied_output(
result.index = default_index(len(result))
return result

def _aggregate_named(self, func, *args, **kwargs):
# Note: this is very similar to _aggregate_series_pure_python,
# but that does not pin group.name
result = {}
initialized = False

for name, group in self._grouper.get_iterator(self._obj_with_exclusions):
# needed for pandas/tests/groupby/test_groupby.py::test_basic_aggregations
object.__setattr__(group, "name", name)

output = func(group, *args, **kwargs)
output = ops.extract_result(output)
if not initialized:
# We only do this validation on the first iteration
ops.check_result_array(output, group.dtype)
initialized = True
result[name] = output

return result

__examples_series_doc = dedent(
"""
>>> ser = pd.Series([390.0, 350.0, 30.0, 20.0],
@@ -1566,6 +1573,61 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs)
relabeling, func, columns, order = reconstruct_func(func, **kwargs)
func = maybe_mangle_lambdas(func)

if maybe_use_numba(engine):
# Not all agg functions support numba, only propagate numba kwargs
# if user asks for numba
kwargs["engine"] = engine
kwargs["engine_kwargs"] = engine_kwargs

op = GroupByApply(self, func, args=args, kwargs=kwargs)
result = op.agg()
if not is_dict_like(func) and result is not None:
# GH #52849
if not self.as_index and is_list_like(func):
return result.reset_index()
else:
return result
elif relabeling:
# this should be the only (non-raising) case with relabeling
# used reordered index of columns
result = cast(DataFrame, result)
result = result.iloc[:, order]
result = cast(DataFrame, result)
# error: Incompatible types in assignment (expression has type
# "Optional[List[str]]", variable has type
# "Union[Union[Union[ExtensionArray, ndarray[Any, Any]],
# Index, Series], Sequence[Any]]")
result.columns = columns # type: ignore[assignment]

if result is None:
# Remove the kwargs we inserted
# (already stored in engine, engine_kwargs arguments)
if "engine" in kwargs:
del kwargs["engine"]
del kwargs["engine_kwargs"]
# at this point func is not a str, list-like, dict-like,
# or a known callable(e.g. sum)
if maybe_use_numba(engine):
return self._aggregate_with_numba(
func, *args, engine_kwargs=engine_kwargs, **kwargs
)
# grouper specific aggregations
result = self._python_agg_general(func, *args, **kwargs)

if not self.as_index:
result = self._insert_inaxis_grouper(result)
result.index = default_index(len(result))

return result

agg = aggregate

def _agg_for_resample(
self, func=None, *args, engine=None, engine_kwargs=None, **kwargs
):
relabeling, func, columns, order = reconstruct_func(func, **kwargs)
func = maybe_mangle_lambdas(func)

if maybe_use_numba(engine):
# Not all agg functions support numba, only propagate numba kwargs
# if user asks for numba
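Editor's note: the ``relabeling`` branch above is what named aggregation hits: ``reconstruct_func`` yields the requested output ``columns`` and their ``order``, and the result is reordered and relabeled at the end. A brief sketch via the public API (column names are illustrative):

import pandas as pd

df = pd.DataFrame({"g": ["x", "x", "y"], "a": [1, 2, 3], "b": [4, 5, 6]})

# kwargs-style named aggregation takes the relabeling branch
print(df.groupby("g").agg(a_sum=("a", "sum"), b_max=("b", "max")))

# with as_index=False the grouper is re-inserted as a regular column
print(df.groupby("g", as_index=False).agg(a_sum=("a", "sum")))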
@@ -1642,8 +1704,6 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs)

return result

agg = aggregate

def _python_agg_general(self, func, *args, **kwargs):
f = lambda x: func(x, *args, **kwargs)

13 changes: 8 additions & 5 deletions pandas/core/groupby/ops.py
@@ -588,6 +588,7 @@ def __init__(
self._groupings: list[grouper.Grouping] = list(groupings)
self._sort = sort
self.dropna = dropna
self._is_resample = False

@property
def groupings(self) -> list[grouper.Grouping]:
@@ -939,12 +940,14 @@ def _aggregate_series_pure_python(

for i, group in enumerate(splitter):
res = func(group)
res = extract_result(res)

if not initialized:
# We only do this validation on the first iteration
check_result_array(res, group.dtype)
initialized = True
if self._is_resample:
res = extract_result(res)

if not initialized:
# We only do this validation on the first iteration
check_result_array(res, group.dtype)
initialized = True

result[i] = res
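Editor's note: with the ``_is_resample`` gate above, plain groupby no longer unwraps UDF results; only resample keeps running ``extract_result`` and the first-iteration ``check_result_array`` validation. A rough sketch of what that unwrapping does; this helper is a simplified stand-in, not the real ``extract_result``:

import pandas as pd

def extract_result_sketch(res):
    # simplified stand-in: reduce a length-1 Series to its scalar,
    # leave everything else untouched
    if isinstance(res, pd.Series) and len(res) == 1:
        return res.iloc[0]
    return res

print(extract_result_sketch(pd.Series([5])))     # 5 (unwrapped)
print(extract_result_sketch(pd.Series([1, 2])))  # length-2 Series, unchanged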

14 changes: 14 additions & 0 deletions pandas/core/indexes/range.py
@@ -29,6 +29,7 @@
doc,
)

from pandas.core.dtypes.base import ExtensionDtype
from pandas.core.dtypes.common import (
ensure_platform_int,
ensure_python_int,
@@ -42,6 +43,7 @@
from pandas.core import ops
import pandas.core.common as com
from pandas.core.construction import extract_array
from pandas.core.indexers import check_array_indexer
import pandas.core.indexes.base as ibase
from pandas.core.indexes.base import (
Index,
@@ -1048,6 +1050,18 @@ def __getitem__(self, key):
"and integer or boolean "
"arrays are valid indices"
)
elif com.is_bool_indexer(key):
if isinstance(getattr(key, "dtype", None), ExtensionDtype):
np_key = key.to_numpy(dtype=bool, na_value=False)
else:
np_key = np.asarray(key, dtype=bool)
check_array_indexer(self._range, np_key) # type: ignore[arg-type]
# Short circuit potential _shallow_copy check
if np_key.all():
return self._simple_new(self._range, name=self.name)
elif not np_key.any():
return self._simple_new(_empty_range, name=self.name)
return self.take(np.flatnonzero(np_key))
return super().__getitem__(key)

def _getitem_slice(self, slobj: slice) -> Self:
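Editor's note: the boolean-mask branch added above short-circuits all-True and all-False masks so the ``RangeIndex`` type is preserved without materializing values, and otherwise falls back to ``take`` on the selected positions. A quick sketch of the observable effect (assuming the behavior described for :issue:`57588`):

import numpy as np
import pandas as pd

idx = pd.RangeIndex(5)

print(idx[np.ones(5, dtype=bool)])   # all True: the same range, returned as a RangeIndex
print(idx[np.zeros(5, dtype=bool)])  # all False: an empty RangeIndex
print(idx[np.array([True, True, False, True, False])])  # mixed: take() on positions 0, 1, 3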