-
-
Notifications
You must be signed in to change notification settings - Fork 18.7k
BUG: df.agg, df.transform and df.apply use different methods when axis=1 than when axis=0 #21224
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
3d95bfc
262bd3e
ed43757
2be3747
b6382d4
5ad024c
39ced29
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
- Loading branch information
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -105,6 +105,11 @@ def agg_axis(self): | |
def get_result(self): | ||
""" compute the results """ | ||
|
||
# dispatch to agg | ||
if isinstance(self.f, (list, dict)): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you use `is_list_like` and `is_dict_like` here instead of the `isinstance` check? |
||
return self.obj.aggregate(self.f, axis=self.axis, | ||
*self.args, **self.kwds) | ||
|
||
# all empty | ||
if len(self.columns) == 0 and len(self.index) == 0: | ||
return self.apply_empty_result() | ||
|
@@ -308,15 +313,6 @@ def wrap_results(self): | |
class FrameRowApply(FrameApply): | ||
axis = 0 | ||
|
||
def get_result(self): | ||
|
||
# dispatch to agg | ||
if isinstance(self.f, (list, dict)): | ||
return self.obj.aggregate(self.f, axis=self.axis, | ||
*self.args, **self.kwds) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
|
||
|
||
return super(FrameRowApply, self).get_result() | ||
|
||
def apply_broadcast(self): | ||
return super(FrameRowApply, self).apply_broadcast(self.obj) | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6080,11 +6080,20 @@ def aggregate(self, func, axis=0, *args, **kwargs): | |
return result | ||
|
||
def _aggregate(self, arg, axis=0, *args, **kwargs): | ||
obj = self.T if axis == 1 else self | ||
return super(DataFrame, obj)._aggregate(arg, *args, **kwargs) | ||
if axis == 1: | ||
result, how = (super(DataFrame, self.T) | ||
._aggregate(arg, *args, **kwargs)) | ||
result = result.T if result is not None else result | ||
return result, how | ||
return super(DataFrame, self)._aggregate(arg, *args, **kwargs) | ||
|
||
agg = aggregate | ||
|
||
def transform(self, func, axis=0, *args, **kwargs): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Does this docstring get updated somewhere? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I'll add the `Appender`. |
||
if axis == 1: | ||
return super(DataFrame, self.T).transform(func, *args, **kwargs).T | ||
return super(DataFrame, self).transform(func, *args, **kwargs) | ||
|
||
def apply(self, func, axis=0, broadcast=None, raw=False, reduce=None, | ||
result_type=None, args=(), **kwds): | ||
""" | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9193,16 +9193,14 @@ def ewm(self, com=None, span=None, halflife=None, alpha=None, | |
|
||
cls.ewm = ewm | ||
|
||
@Appender(_shared_docs['transform'] % _shared_doc_kwargs) | ||
def transform(self, func, *args, **kwargs): | ||
result = self.agg(func, *args, **kwargs) | ||
if is_scalar(result) or len(result) != len(self): | ||
raise ValueError("transforms cannot produce " | ||
"aggregated results") | ||
@Appender(_shared_docs['transform'] % _shared_doc_kwargs) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think you need to add this `Appender` in `frame.py` as well. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, I'll do that. |
||
def transform(self, func, *args, **kwargs): | ||
result = self.agg(func, *args, **kwargs) | ||
if is_scalar(result) or len(result) != len(self): | ||
raise ValueError("transforms cannot produce " | ||
"aggregated results") | ||
|
||
return result | ||
|
||
cls.transform = transform | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. when transform was added by calling |
||
return result | ||
|
||
# ---------------------------------------------------------------------- | ||
# Misc methods | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,6 +5,7 @@ | |
import pytest | ||
|
||
import operator | ||
from collections import OrderedDict | ||
from datetime import datetime | ||
from itertools import chain | ||
|
||
|
@@ -846,58 +847,74 @@ def test_consistency_for_boxed(self, box): | |
assert_frame_equal(result, expected) | ||
|
||
|
||
def zip_frames(*frames): | ||
def zip_frames(frames, axis=1): | ||
""" | ||
take a list of frames, zip the columns together for each | ||
assume that these all have the first frame columns | ||
take a list of frames, zip them together under the | ||
assumption that these all have the first frames' index/columns. | ||
|
||
return a new frame | ||
Returns | ||
------- | ||
new_frame : DataFrame | ||
""" | ||
columns = frames[0].columns | ||
zipped = [f[c] for c in columns for f in frames] | ||
return pd.concat(zipped, axis=1) | ||
if axis == 1: | ||
columns = frames[0].columns | ||
zipped = [f.loc[:, c] for c in columns for f in frames] | ||
return pd.concat(zipped, axis=1) | ||
else: | ||
index = frames[0].index | ||
zipped = [f.loc[i, :] for i in index for f in frames] | ||
return pd.DataFrame(zipped) | ||
|
||
|
||
class TestDataFrameAggregate(TestData): | ||
|
||
def test_agg_transform(self): | ||
def test_agg_transform(self, axis): | ||
other_axis = abs(axis - 1) | ||
|
||
with np.errstate(all='ignore'): | ||
|
||
f_sqrt = np.sqrt(self.frame) | ||
f_abs = np.abs(self.frame) | ||
f_sqrt = np.sqrt(self.frame) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Having "absolute" come before "sqrt" maintains alphabetical ordering and makes creating the MultiIndexes easier below. |
||
|
||
# ufunc | ||
result = self.frame.transform(np.sqrt) | ||
result = self.frame.transform(np.sqrt, axis=axis) | ||
expected = f_sqrt.copy() | ||
assert_frame_equal(result, expected) | ||
|
||
result = self.frame.apply(np.sqrt) | ||
result = self.frame.apply(np.sqrt, axis=axis) | ||
assert_frame_equal(result, expected) | ||
|
||
result = self.frame.transform(np.sqrt) | ||
result = self.frame.transform(np.sqrt, axis=axis) | ||
assert_frame_equal(result, expected) | ||
|
||
# list-like | ||
result = self.frame.apply([np.sqrt]) | ||
result = self.frame.apply([np.sqrt], axis=axis) | ||
expected = f_sqrt.copy() | ||
expected.columns = pd.MultiIndex.from_product( | ||
[self.frame.columns, ['sqrt']]) | ||
if axis == 0: | ||
expected.columns = pd.MultiIndex.from_product( | ||
[self.frame.columns, ['sqrt']]) | ||
else: | ||
expected.index = pd.MultiIndex.from_product( | ||
[self.frame.index, ['sqrt']]) | ||
assert_frame_equal(result, expected) | ||
|
||
result = self.frame.transform([np.sqrt]) | ||
result = self.frame.transform([np.sqrt], axis=axis) | ||
assert_frame_equal(result, expected) | ||
|
||
# multiple items in list | ||
# these are in the order as if we are applying both | ||
# functions per series and then concatting | ||
expected = zip_frames(f_sqrt, f_abs) | ||
expected.columns = pd.MultiIndex.from_product( | ||
[self.frame.columns, ['sqrt', 'absolute']]) | ||
result = self.frame.apply([np.sqrt, np.abs]) | ||
result = self.frame.apply([np.abs, np.sqrt], axis=axis) | ||
expected = zip_frames([f_abs, f_sqrt], axis=other_axis) | ||
if axis == 0: | ||
expected.columns = pd.MultiIndex.from_product( | ||
[self.frame.columns, ['absolute', 'sqrt']]) | ||
else: | ||
expected.index = pd.MultiIndex.from_product( | ||
[self.frame.index, ['absolute', 'sqrt']]) | ||
assert_frame_equal(result, expected) | ||
|
||
result = self.frame.transform(['sqrt', np.abs]) | ||
result = self.frame.transform([np.abs, 'sqrt'], axis=axis) | ||
assert_frame_equal(result, expected) | ||
|
||
def test_transform_and_agg_err(self, axis): | ||
|
@@ -985,46 +1002,51 @@ def test_agg_dict_nested_renaming_depr(self): | |
|
||
def test_agg_reduce(self, axis): | ||
other_axis = abs(axis - 1) | ||
name1, name2 = self.frame.axes[other_axis].unique()[:2] | ||
name1, name2 = self.frame.axes[other_axis].unique()[:2].sort_values() | ||
|
||
# all reducers | ||
expected = zip_frames(self.frame.mean(axis=axis).to_frame(), | ||
self.frame.max(axis=axis).to_frame(), | ||
self.frame.sum(axis=axis).to_frame()).T | ||
expected.index = ['mean', 'max', 'sum'] | ||
expected = pd.concat([self.frame.mean(axis=axis), | ||
self.frame.max(axis=axis), | ||
self.frame.sum(axis=axis), | ||
], axis=1) | ||
expected.columns = ['mean', 'max', 'sum'] | ||
expected = expected.T if axis == 0 else expected | ||
|
||
result = self.frame.agg(['mean', 'max', 'sum'], axis=axis) | ||
assert_frame_equal(result, expected) | ||
|
||
# dict input with scalars | ||
func = {name1: 'mean', name2: 'sum'} | ||
func = OrderedDict([(name1, 'mean'), (name2, 'sum')]) | ||
result = self.frame.agg(func, axis=axis) | ||
expected = Series([self.frame.loc(other_axis)[name1].mean(), | ||
self.frame.loc(other_axis)[name2].sum()], | ||
index=[name1, name2]) | ||
assert_series_equal(result.reindex_like(expected), expected) | ||
assert_series_equal(result, expected) | ||
|
||
# dict input with lists | ||
func = {name1: ['mean'], name2: ['sum']} | ||
func = OrderedDict([(name1, ['mean']), (name2, ['sum'])]) | ||
result = self.frame.agg(func, axis=axis) | ||
expected = DataFrame({ | ||
name1: Series([self.frame.loc(other_axis)[name1].mean()], | ||
index=['mean']), | ||
name2: Series([self.frame.loc(other_axis)[name2].sum()], | ||
index=['sum'])}) | ||
assert_frame_equal(result.reindex_like(expected), expected) | ||
expected = expected.T if axis == 1 else expected | ||
assert_frame_equal(result, expected) | ||
|
||
# dict input with lists with multiple | ||
func = {name1: ['mean', 'sum'], | ||
name2: ['sum', 'max']} | ||
func = OrderedDict([(name1, ['mean', 'sum']), (name2, ['sum', 'max'])]) | ||
result = self.frame.agg(func, axis=axis) | ||
expected = DataFrame({ | ||
name1: Series([self.frame.loc(other_axis)[name1].mean(), | ||
expected = DataFrame(OrderedDict([ | ||
(name1, Series([self.frame.loc(other_axis)[name1].mean(), | ||
self.frame.loc(other_axis)[name1].sum()], | ||
index=['mean', 'sum']), | ||
name2: Series([self.frame.loc(other_axis)[name2].sum(), | ||
index=['mean', 'sum'])), | ||
(name2, Series([self.frame.loc(other_axis)[name2].sum(), | ||
self.frame.loc(other_axis)[name2].max()], | ||
index=['sum', 'max'])}) | ||
assert_frame_equal(result.reindex_like(expected), expected) | ||
index=['sum', 'max'])), | ||
])) | ||
expected = expected.T if axis == 1 else expected | ||
assert_frame_equal(result, expected) | ||
|
||
def test_nuiscance_columns(self): | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Use double backticks around ``TypeError``. You don't need the second sentence; just list the issue.