TST: add method/dtype coverage to str-accessor; precursor to #23167 by h-vetinari · Pull Request #23582 · pandas-dev/pandas · GitHub
[go: up one dir, main page]

Skip to content

TST: add method/dtype coverage to str-accessor; precursor to #23167 #23582

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
Nov 28, 2018
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Unify test_str_accessor_api_for_categorical with parametrized tests
  • Loading branch information
h-vetinari committed Nov 15, 2018
commit 41cecb9dd9e17db0a2c701c1dc0ac1bbd5e402df
76 changes: 0 additions & 76 deletions pandas/tests/series/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -602,82 +602,6 @@ def f():
ordered=True))
tm.assert_series_equal(result, expected)

def test_str_accessor_api_for_categorical(self):
# https://github.com/pandas-dev/pandas/issues/10661
from pandas.core.strings import StringMethods
s = Series(list('aabb'))
s = s + " " + s
c = s.astype('category')
assert isinstance(c.str, StringMethods)

# str functions, which need special arguments
special_func_defs = [
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The full list of these functions & arg-combinations is reflected in the any_string_method-fixture of test_strings.py now.

('cat', (list("zyxw"),), {"sep": ","}),
('center', (10,), {}),
('contains', ("a",), {}),
('count', ("a",), {}),
('decode', ("UTF-8",), {}),
('encode', ("UTF-8",), {}),
('endswith', ("a",), {}),
('extract', ("([a-z]*) ",), {"expand": False}),
('extract', ("([a-z]*) ",), {"expand": True}),
('extractall', ("([a-z]*) ",), {}),
('find', ("a",), {}),
('findall', ("a",), {}),
('index', (" ",), {}),
('ljust', (10,), {}),
('match', ("a"), {}), # deprecated...
('normalize', ("NFC",), {}),
('pad', (10,), {}),
('partition', (" ",), {"expand": False}), # not default
('partition', (" ",), {"expand": True}), # default
('repeat', (3,), {}),
('replace', ("a", "z"), {}),
('rfind', ("a",), {}),
('rindex', (" ",), {}),
('rjust', (10,), {}),
('rpartition', (" ",), {"expand": False}), # not default
('rpartition', (" ",), {"expand": True}), # default
('slice', (0, 1), {}),
('slice_replace', (0, 1, "z"), {}),
('split', (" ",), {"expand": False}), # default
('split', (" ",), {"expand": True}), # not default
('startswith', ("a",), {}),
('wrap', (2,), {}),
('zfill', (10,), {})
]
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also added a handful more combinations to test

_special_func_names = [f[0] for f in special_func_defs]

# * get, join: they need individual elements of type list, but
# we can't make a categorical with lists as individual categories.
# -> `s.str.split(" ").astype("category")` will error!
# * `translate` has different interfaces for py2 vs. py3
_ignore_names = ["get", "join", "translate"]
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually, I also got rid of this ignore-list - meaning the previously "ignored" methods are being tested now as well.


str_func_names = [f for f in dir(s.str) if not (
f.startswith("_") or
f in _special_func_names or
f in _ignore_names)]

func_defs = [(f, (), {}) for f in str_func_names]
func_defs.extend(special_func_defs)

for func, args, kwargs in func_defs:
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This loop is now explicitly parametrized.

res = getattr(c.str, func)(*args, **kwargs)
exp = getattr(s.str, func)(*args, **kwargs)

if isinstance(res, DataFrame):
tm.assert_frame_equal(res, exp)
else:
tm.assert_series_equal(res, exp)

invalid = Series([1, 2, 3]).astype('category')
msg = "Can only use .str accessor with string"

with pytest.raises(AttributeError, match=msg):
invalid.str
assert not hasattr(invalid, 'str')
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This part is now tested roughly 100x as thoroughly by test_strings.py::TestStringMethods::test_api_per_dtype


def test_dt_accessor_api_for_categorical(self):
# https://github.com/pandas-dev/pandas/issues/10661
from pandas.core.indexes.accessors import Properties
Expand Down
146 changes: 104 additions & 42 deletions pandas/tests/test_strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,45 +19,58 @@
import pandas.conftest as top_level_conftest
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

never import conftest

import pandas.core.strings as strings


def assert_series_or_index_equal(left, right):
    """
    Assert that `left` and `right` are equal, choosing the check by type.

    A `Series` on the left is compared with `assert_series_equal`; anything
    else is handled as an `Index` and compared with `assert_index_equal`.
    """
    if not isinstance(left, Series):
        # not a Series, so treat it as an Index
        assert_index_equal(left, right)
        return
    assert_series_equal(left, right)


# method names plus minimal set of arguments to call
_all_string_methods = [
('get', [0]),
('join', [',']),
('contains', ['some_pattern']),
('match', ['some_pattern']),
('count', ['some_pattern']),
('startswith', ['some_pattern']),
('endswith', ['some_pattern']),
('findall', ['some_pattern']),
('find', ['some_pattern']),
('rfind', ['some_pattern']),
# because "index"/"rindex" fail (intentionally) if the string is not found
# (and we're testing on generic data), search only for empty string
('index', ['']),
('rindex', ['']),
('extract', [r'(some_pattern)']),
('extractall', [r'(some_pattern)']),
('replace', ['some_pattern', 'other_pattern']),
('repeat', [10]),
('pad', [10]),
('center', [10]),
('ljust', [10]),
('rjust', [10]),
('zfill', [10]),
('wrap', [10]),
('encode', ['utf8']),
('decode', ['utf8']),
('translate', [{97: 100}]), # translating 'a' to 'd'
('normalize', ['NFC'])
_any_string_method = [
('cat', (), {'sep': ','}), # noqa: E241
('cat', (Series(list('zyx')),), {'sep': ',', # noqa: E241
'join': 'left'}),
('center', (10,), {}), # noqa: E241
('contains', ('a',), {}), # noqa: E241
('count', ('a',), {}), # noqa: E241
('decode', ('UTF-8',), {}), # noqa: E241
('encode', ('UTF-8',), {}), # noqa: E241
('endswith', ('a',), {}), # noqa: E241
('extract', ('([a-z]*)',), {'expand': False}), # noqa: E241
('extract', ('([a-z]*)',), {'expand': True}), # noqa: E241
('extractall', ('([a-z]*)',), {}), # noqa: E241
('find', ('a',), {}), # noqa: E241
('findall', ('a',), {}), # noqa: E241
('get', (0,), {}), # noqa: E241
# because "index" (and "rindex") fail intentionally
# if the string is not found, search only for empty string
('index', ('',), {}), # noqa: E241
('join', (',',), {}), # noqa: E241
('ljust', (10,), {}), # noqa: E241
('match', ('a',), {}), # noqa: E241
('normalize', ('NFC',), {}), # noqa: E241
('pad', (10,), {}), # noqa: E241
('partition', (' ',), {'expand': False}), # noqa: E241
('partition', (' ',), {'expand': True}), # noqa: E241
('repeat', (3,), {}), # noqa: E241
('replace', ('a', 'z',), {}), # noqa: E241
('rfind', ('a',), {}), # noqa: E241
('rindex', ('',), {}), # noqa: E241
('rjust', (10,), {}), # noqa: E241
('rpartition', (' ',), {'expand': False}), # noqa: E241
('rpartition', (' ',), {'expand': True}), # noqa: E241
('slice', (0, 1,), {}), # noqa: E241
('slice_replace', (0, 1, 'z',), {}), # noqa: E241
('split', (' ',), {'expand': False}), # noqa: E241
('split', (' ',), {'expand': True}), # noqa: E241
('startswith', ('a',), {}), # noqa: E241
# translating unicode points of "a" to "d"
('translate', ({97: 100},), {}), # noqa: E241
('wrap', (2,), {}), # noqa: E241
('zfill', (10,), {}) # noqa: E241
] + list(zip([
# methods without positional arguments: zip with empty tuple
# methods without positional arguments: zip with empty tuple and empty dict
'cat', 'len', 'split', 'rsplit',
'partition', 'rpartition', 'get_dummies',
'slice', 'slice_replace',
Expand All @@ -67,17 +80,42 @@ def assert_series_or_index_equal(left, right):
'isalpha', 'isnumeric', 'isalnum',
'isdigit', 'isdecimal', 'isspace',
'islower', 'isupper', 'istitle'
], [tuple()] * 100))
ids, _ = zip(*_all_string_methods) # use method name as fixture-id
], [()] * 100, [{}] * 100))
ids, _, _ = zip(*_any_string_method) # use method name as fixture-id


# sanity check: every public attribute of StringMethods must appear among the
# fixture ids, so that no method is silently left out of the parametrization
missing_methods = set(
    filter(lambda name: not name.startswith('_'), dir(strings.StringMethods))
).difference(ids)
assert not missing_methods
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This makes sure that:

  • fixture is complete (i.e. no methods missing; as previously in test_str_accessor_api_for_categorical)
  • the reader can easily see a list of methods


@pytest.fixture(params=_all_string_methods, ids=ids)
def all_string_methods(request):

@pytest.fixture(params=_any_string_method, ids=ids)
def any_string_method(request):
"""
Fixture for all public methods of `StringMethods`

This fixture returns a tuple of the method name and a list of sample values
for the required positional arguments of that method.
This fixture returns a tuple of the method name and sample arguments
necessary to call the method.

Returns
-------
method_name : str
The name of the method in `StringMethods`
args : tuple
Sample values for the positional arguments
kwargs : dict
Sample values for the keyword arguments

Examples
--------
>>> def test_something(any_string_method):
... s = pd.Series(['a', 'b', np.nan, 'd'])
...
... method_name, args, kwargs = any_string_method
... method = getattr(s.str, method_name)
... # will not raise
... method(*args, **kwargs)
"""
return request.param

Expand Down Expand Up @@ -188,14 +226,14 @@ def test_api_mi_raises(self):
@pytest.mark.parametrize('box', [Series, Index])
def test_api_per_method(self, box, dtype,
any_allowed_skipna_inferred_dtype,
all_string_methods):
any_string_method):
# this test does not check correctness of the different methods,
# just that the methods work on the specified (inferred) dtypes,
# and raise on all others

# one instance of each parametrized fixture
inferred_dtype, values = any_allowed_skipna_inferred_dtype
method_name, minimal_args = all_string_methods
method_name, args, kwargs = any_string_method

# TODO: get rid of these xfails
if (method_name not in ['encode', 'decode', 'len']
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same comment as above

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These xfails will be gone, and then the test reads very clearly, IMO

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this test cannot be broken up as easily, because the allowed types depend on the method being checked!

Expand All @@ -220,6 +258,10 @@ def test_api_per_method(self, box, dtype,
if (method_name in ['partition', 'rpartition'] and box == Index
and inferred_dtype != 'bytes'):
pytest.xfail(reason='Method not nan-safe on Index; see GH 23558')
if (method_name == 'split' and box == Index
        and inferred_dtype in ['mixed', 'mixed-integer']
        and dtype == object and kwargs.get('expand', None) is True):
    # Known failure for split(expand=True) on an object-dtype Index with
    # mixed values. The identity check `is True` (rather than `== True`)
    # matches only the literal boolean and avoids the E712 lint error.
    pytest.xfail(reason='Method not nan-safe on Index; see GH 23677')

t = box(values, dtype=dtype) # explicit dtype to avoid casting
method = getattr(t.str, method_name)
Expand All @@ -236,14 +278,34 @@ def test_api_per_method(self, box, dtype,
+ ['mixed', 'mixed-integer'] * mixed_allowed)

if inferred_dtype in allowed_types:
method(*minimal_args) # works!
method(*args, **kwargs) # works!
else:
# GH 23011, GH 23163
msg = ('Cannot use .str.{name} with values of inferred dtype '
'{inferred_dtype!r}.'.format(name=method_name,
inferred_dtype=inferred_dtype))
with tm.assert_raises_regex(TypeError, msg):
method(*minimal_args)
with pytest.raises(TypeError, match=msg):
method(*args, **kwargs)

def test_api_for_categorical(self, any_string_method):
9B80
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These are the remnants of test_str_accessor_api_for_categorical after parametrization.

# https://github.com/pandas-dev/pandas/issues/10661
s = Series(list('aabb'))
s = s + " " + s
c = s.astype('category')
assert isinstance(c.str, strings.StringMethods)

method_name, args, kwargs = any_string_method

result = getattr(c.str, method_name)(*args, **kwargs)
expected = getattr(s.str, method_name)(*args, **kwargs)

if isinstance(result, DataFrame):
tm.assert_frame_equal(result, expected)
elif isinstance(result, Series):
tm.assert_series_equal(result, expected)
else:
# str.cat(others=None) returns string, for example
assert result == expected

def test_iter(self):
# GH3638
Expand Down
0