-
-
Notifications
You must be signed in to change notification settings - Fork 18.7k
Fix 'observed' kwarg not doing anything on SeriesGroupBy #26463
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
a5d6d1a
41f49f4
2575c41
1c02d9f
7350472
0a949d5
0e9f473
1ef54f4
cd481ad
a515caf
ff42dd7
c22875c
cc0b725
629a144
e4fda22
8cfa4a1
db176de
d520952
3591dbc
f97c8a1
d5c9c40
ad16db8
7c525a1
e6bca5e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
- Loading branch information
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,7 @@ | ||
import numpy as np | ||
import pytest | ||
|
||
from pandas import DataFrame, MultiIndex | ||
from pandas import DataFrame, CategoricalIndex, Index, MultiIndex | ||
from pandas.util import testing as tm | ||
|
||
|
||
|
@@ -76,3 +76,49 @@ def three_group(): | |
'D': np.random.randn(11), | ||
'E': np.random.randn(11), | ||
'F': np.random.randn(11)}) | ||
|
||
|
||
@pytest.fixture | ||
def df_cat(): | ||
df = DataFrame({'a': ['x', 'x', 'x', 'y'], | ||
'b': ['a', 'a', 'b', 'a'], | ||
'c': [1, 2, 3, 4]}) | ||
df['a'] = df['a'].astype('category') | ||
df['b'] = df['b'].astype('category') | ||
return df | ||
|
||
|
||
@pytest.fixture | ||
def multi_index_cat_complete(): | ||
lvls = [CategoricalIndex(['x', 'y'], categories=['x', 'y'], ordered=False), | ||
CategoricalIndex(['a', 'b'], categories=['a', 'b'], ordered=False)] | ||
index = MultiIndex.from_product(lvls, names=['a', 'b']) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Since 'a' and 'b' are categories can you use distinct names? 'foo' and 'bar' are fine |
||
return index | ||
|
||
|
||
@pytest.fixture | ||
def multi_index_cat_partial(df_cat): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Question on naming - what does There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. By There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hmm OK. I find this rather confusing though that the determination as to whether or not this is "partial" is done entirely outside the scope of the fixture (i.e. it is up to the injected test to only partially align). Is there not a way to make the fixture self contained to provide a Series or Frame instead? May require a rewrite of your tests but I think this is just confusing |
||
return MultiIndex.from_frame(df_cat[['a', 'b']].drop_duplicates()) | ||
|
||
|
||
@pytest.fixture | ||
def multi_index_non_cat_partial(): | ||
return MultiIndex.from_tuples([('x', 'a'), ('x', 'b'), ('y', 'a')], | ||
names=('a', 'b')) | ||
|
||
|
||
@pytest.fixture | ||
def multi_index_cat_compl_dict(): | ||
lvls = [CategoricalIndex(['x', 'y'], categories=['x', 'y'], ordered=False), | ||
CategoricalIndex(['a', 'b'], categories=['a', 'b'], ordered=False), | ||
Index(['min', 'max'])] | ||
index = MultiIndex.from_product(lvls, names=['a', 'b', None]) | ||
return index | ||
|
||
|
||
@pytest.fixture | ||
def multi_index_non_cat_partial_dict(): | ||
return MultiIndex.from_tuples([('x', 'a', 'min'), ('x', 'a', 'max'), | ||
('x', 'b', 'min'), ('x', 'b', 'max'), | ||
('y', 'a', 'min'), ('y', 'a', 'max')], | ||
names=('a', 'b', None)) |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,5 @@ | ||
from datetime import datetime | ||
from collections import OrderedDict | ||
|
||
import numpy as np | ||
import pytest | ||
|
@@ -965,65 +966,31 @@ def test_shift(fill_value): | |
assert_equal(res, expected) | ||
|
||
|
||
def test_groupby_series_observed(): | ||
@pytest.mark.parametrize("observed, index, op, data", [ | ||
(True, 'multi_index_cat_partial', 'agg', [3, 3, 4]), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You can add a separate decorator for There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could also use the observed fixture separately There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think different values for index prevent me from doing that: I don't want The same goes for There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Oh OK I misread those values. You can also split those out into a separate test if they should exhibit different behavior |
||
(True, 'multi_index_non_cat_partial', 'apply', [3, 3, 4]), | ||
(False, 'multi_index_cat_complete', 'agg', [3, 3, 4, np.nan]), | ||
(False, 'multi_index_cat_complete', 'apply', [3, 3, 4, np.nan]), | ||
(None, 'multi_index_cat_complete', 'agg', [3, 3, 4, np.nan]), | ||
(None, 'multi_index_cat_complete', 'apply', [3, 3, 4, np.nan])]) | ||
def test_groupby_series_observed(request, df_cat, observed, index, op, data): | ||
# GH 24880 | ||
df = DataFrame({'a': ['x', 'x', 'x', 'y'], | ||
'b': ['a', 'a', 'b', 'a'], | ||
'c': [1, 2, 3, 4]}) | ||
df['a'] = df['a'].astype('category') | ||
df['b'] = df['b'].astype('category') | ||
|
||
# test .agg and .apply when observed == False | ||
lvls = [CategoricalIndex(['x', 'y'], categories=['x', 'y'], ordered=False), | ||
CategoricalIndex(['a', 'b'], categories=['a', 'b'], ordered=False)] | ||
index, _ = MultiIndex.from_product(lvls, names=['a', 'b']).sortlevel() | ||
expected = pd.Series(data=[3, 3, 4, np.nan], index=index, name='c') | ||
actual_agg = df.groupby(['a', 'b']).c.agg(sum) | ||
actual_apply = df.groupby(['a', 'b']).c.apply(sum) | ||
assert_series_equal(expected, actual_agg) | ||
assert_series_equal(expected, actual_apply) | ||
|
||
# test .agg when observed == True | ||
index = MultiIndex.from_frame(df[['a', 'b']].drop_duplicates()) | ||
expected = pd.Series([3, 3, 4], index=index, name='c') | ||
actual = df.groupby(['a', 'b'], observed=True).c.agg(sum) | ||
assert_series_equal(expected, actual) | ||
|
||
# test .apply when observed == True | ||
index = MultiIndex.from_tuples([('x', 'a'), ('x', 'b'), (' D8F0 y', 'a')], | ||
names=('a', 'b')) | ||
expected = pd.Series([3, 3, 4], index=index, name='c') | ||
actual = df.groupby(['a', 'b'], observed=True).c.apply(sum) | ||
index = request.getfixturevalue(index) | ||
expected = pd.Series(data=data, index=index, name='c') | ||
grouped = df_cat.groupby(['a', 'b'], observed=observed).c | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Stylistic nit but can you use bracket notation instead of dot notation? |
||
actual = getattr(grouped, op)(sum) | ||
WillAyd marked this conversation as resolved.
Show resolved
Hide resolved
|
||
assert_series_equal(expected, actual) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you change |
||
|
||
|
||
def test_groupby_series_observed_apply_dict(): | ||
@pytest.mark.parametrize("observed, index, data", [ | ||
(True, 'multi_index_non_cat_partial_dict', [1, 2, 3, 3, 4, 4]), | ||
(False, 'multi_index_cat_compl_dict', [1, 2, 3, 3, 4, 4, np.nan, np.nan]), | ||
(None, 'multi_index_cat_compl_dict', [1, 2, 3, 3, 4, 4, np.nan, np.nan])]) | ||
def test_groupby_series_observed_apply_dict(request, df_cat, observed, index, | ||
data): | ||
# GH 24880 | ||
df = DataFrame({'a': ['x', 'x', 'x', 'y'], | ||
'b': ['a', 'a', 'b', 'a'], | ||
'c': [1, 2, 3, 4]}) | ||
df['a'] = df['a'].astype('category') | ||
df['b'] = df['b'].astype('category') | ||
|
||
# observed == False | ||
lvls = [CategoricalIndex(['x', 'y'], categories=['x', 'y'], ordered=False), | ||
CategoricalIndex(['a', 'b'], categories=['a', 'b'], ordered=False), | ||
Index(['min', 'max'])] | ||
index, _ = MultiIndex.from_product(lvls, | ||
names=['a', 'b', None]).sortlevel() | ||
expected = pd.Series(data=[2, 1, 3, 3, 4, 4, np.nan, np.nan], | ||
index=index, | ||
name='c') | ||
actual = df.groupby(['a', 'b']).c.apply(lambda x: {'min': x.min(), | ||
'max': x.max()}) | ||
assert_series_equal(expected, actual) | ||
|
||
# observed == True | ||
index = MultiIndex.from_tuples([('x', 'a', 'max'), ('x', 'a', 'min'), | ||
('x', 'b', 'max'), ('x', 'b', 'min'), | ||
('y', 'a', 'max'), ('y', 'a', 'min')], | ||
names=('a', 'b', None)) | ||
expected = pd.Series(data=[2, 1, 3, 3, 4, 4], index=index, name='c') | ||
actual = df.groupby(['a', 'b'], observed=True).c.\ | ||
apply(lambda x: {'min': x.min(), 'max': x.max()}) | ||
index = request.getfixturevalue(index) | ||
expected = pd.Series(data=data, index=index, name='c') | ||
actual = df_cat.groupby(['a', 'b'], observed=observed).c.\ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Also stylistic but we only use implicit line continuations in code base, so could just break after the opening paranethes to apply here if not too long |
||
apply(lambda x: OrderedDict([('min', x.min()), ('max', x.max())])) | ||
assert_series_equal(expected, actual) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can this be more generally used in groupby/test_categorical.py?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I would have to keep doing
.astype('category')
in each of my tests for this. Either that, or derive another fixture fromthree_group
, so it would not decrease the number of fixtures.I also preferred literal values to random ones for easier equality checks.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
it’s not about decreasing the number of fixtures
but rather reusing them across tests as much as possible
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
literal values are fine (u could just replace the random with fixed values)
changing existing to accommodate new tests is better than rolling new