-
-
Notifications
You must be signed in to change notification settings - Fork 18.7k
TST: add method/dtype coverage to str-accessor; precursor to #23167 #23582
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
91b720e
dcee05a
41cecb9
01a3c10
a94f569
f0ae1db
16fe71c
78cead0
0aaa04e
9b36a50
c0752eb
a53a28e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
- Loading branch information
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -602,82 +602,6 @@ def f(): | |
ordered=True)) | ||
tm.assert_series_equal(result, expected) | ||
|
||
def test_str_accessor_api_for_categorical(self): | ||
# https://github.com/pandas-dev/pandas/issues/10661 | ||
from pandas.core.strings import StringMethods | ||
s = Series(list('aabb')) | ||
s = s + " " + s | ||
c = s.astype('category') | ||
assert isinstance(c.str, StringMethods) | ||
|
||
# str functions, which need special arguments | ||
special_func_defs = [ | ||
('cat', (list("zyxw"),), {"sep": ","}), | ||
('center', (10,), {}), | ||
('contains', ("a",), {}), | ||
('count', ("a",), {}), | ||
('decode', ("UTF-8",), {}), | ||
('encode', ("UTF-8",), {}), | ||
('endswith', ("a",), {}), | ||
('extract', ("([a-z]*) ",), {"expand": False}), | ||
('extract', ("([a-z]*) ",), {"expand": True}), | ||
('extractall', ("([a-z]*) ",), {}), | ||
('find', ("a",), {}), | ||
('findall', ("a",), {}), | ||
('index', (" ",), {}), | ||
('ljust', (10,), {}), | ||
('match', ("a"), {}), # deprecated... | ||
('normalize', ("NFC",), {}), | ||
('pad', (10,), {}), | ||
('partition', (" ",), {"expand": False}), # not default | ||
('partition', (" ",), {"expand": True}), # default | ||
('repeat', (3,), {}), | ||
('replace', ("a", "z"), {}), | ||
('rfind', ("a",), {}), | ||
('rindex', (" ",), {}), | ||
('rjust', (10,), {}), | ||
('rpartition', (" ",), {"expand": False}), # not default | ||
('rpartition', (" ",), {"expand": True}), # default | ||
('slice', (0, 1), {}), | ||
('slice_replace', (0, 1, "z"), {}), | ||
('split', (" ",), {"expand": False}), # default | ||
('split', (" ",), {"expand": True}), # not default | ||
('startswith', ("a",), {}), | ||
('wrap', (2,), {}), | ||
('zfill', (10,), {}) | ||
] | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Also added a handful more combinations to test. |
||
_special_func_names = [f[0] for f in special_func_defs] | ||
|
||
# * get, join: they need a individual elements of type lists, but | ||
# we can't make a categorical with lists as individual categories. | ||
# -> `s.str.split(" ").astype("category")` will error! | ||
# * `translate` has different interfaces for py2 vs. py3 | ||
_ignore_names = ["get", "join", "translate"] | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Actually, I also got rid of these previously "ignored" methods, meaning they're being tested now as well. |
||
|
||
str_func_names = [f for f in dir(s.str) if not ( | ||
f.startswith("_") or | ||
f in _special_func_names or | ||
f in _ignore_names)] | ||
|
||
func_defs = [(f, (), {}) for f in str_func_names] | ||
func_defs.extend(special_func_defs) | ||
|
||
for func, args, kwargs in func_defs: | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This loop is now explicitly parametrized. |
||
res = getattr(c.str, func)(*args, **kwargs) | ||
exp = getattr(s.str, func)(*args, **kwargs) | ||
|
||
if isinstance(res, DataFrame): | ||
tm.assert_frame_equal(res, exp) | ||
else: | ||
tm.assert_series_equal(res, exp) | ||
|
||
invalid = Series([1, 2, 3]).astype('category') | ||
msg = "Can only use .str accessor with string" | ||
|
||
with pytest.raises(AttributeError, match=msg): | ||
invalid.str | ||
assert not hasattr(invalid, 'str') | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This part is fully tested ~100x as thoroughly by |
||
|
||
def test_dt_accessor_api_for_categorical(self): | ||
# https://github.com/pandas-dev/pandas/issues/10661 | ||
from pandas.core.indexes.accessors import Properties | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -19,45 +19,58 @@ | |
import pandas.conftest as top_level_conftest | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Never import conftest. |
||
import pandas.core.strings as strings | ||
|
||
|
||
def assert_series_or_index_equal(left, right): | ||
if isinstance(left, Series): | ||
assert_series_equal(left, right) | ||
else: # Index | ||
assert_index_equal(left, right) | ||
|
||
|
||
# method names plus minimal set of arguments to call | ||
_all_string_methods = [ | ||
('get', [0]), | ||
('join', [',']), | ||
('contains', ['some_pattern']), | ||
('match', ['some_pattern']), | ||
('count', ['some_pattern']), | ||
('startswith', ['some_pattern']), | ||
('endswith', ['some_pattern']), | ||
('findall', ['some_pattern']), | ||
('find', ['some_pattern']), | ||
('rfind', ['some_pattern']), | ||
# because "index"/"rindex" fail (intentionally) if the string is not found | ||
# (and we're testing on generic data), search only for empty string | ||
('index', ['']), | ||
('rindex', ['']), | ||
('extract', [r'(some_pattern)']), | ||
('extractall', [r'(some_pattern)']), | ||
('replace', ['some_pattern', 'other_pattern']), | ||
('repeat', [10]), | ||
('pad', [10]), | ||
('center', [10]), | ||
('ljust', [10]), | ||
('rjust', [10]), | ||
('zfill', [10]), | ||
('wrap', [10]), | ||
('encode', ['utf8']), | ||
('decode', ['utf8']), | ||
('translate', [{97: 100}]), # translating 'a' to 'd' | ||
('normalize', ['NFC']) | ||
_any_string_method = [ | ||
('cat', (), {'sep': ','}), # noqa: E241 | ||
('cat', (Series(list('zyx')),), {'sep': ',', # noqa: E241 | ||
'join': 'left'}), | ||
('center', (10,), {}), # noqa: E241 | ||
('contains', ('a',), {}), # noqa: E241 | ||
('count', ('a',), {}), # noqa: E241 | ||
('decode', ('UTF-8',), {}), # noqa: E241 | ||
('encode', ('UTF-8',), {}), # noqa: E241 | ||
('endswith', ('a',), {}), # noqa: E241 | ||
('extract', ('([a-z]*)',), {'expand': False}), # noqa: E241 | ||
('extract', ('([a-z]*)',), {'expand': True}), # noqa: E241 | ||
('extractall', ('([a-z]*)',), {}), # noqa: E241 | ||
('find', ('a',), {}), # noqa: E241 | ||
('findall', ('a',), {}), # noqa: E241 | ||
('get', (0,), {}), # noqa: E241 | ||
# because "index" (and "rindex") fail intentionally | ||
# if the string is not found, search only for empty string | ||
('index', ('',), {}), # noqa: E241 | ||
('join', (',',), {}), # noqa: E241 | ||
('ljust', (10,), {}), # noqa: E241 | ||
('match', ('a',), {}), # noqa: E241 | ||
('normalize', ('NFC',), {}), # noqa: E241 | ||
('pad', (10,), {}), # noqa: E241 | ||
('partition', (' ',), {'expand': False}), # noqa: E241 | ||
('partition', (' ',), {'expand': True}), # noqa: E241 | ||
('repeat', (3,), {}), # noqa: E241 | ||
('replace', ('a', 'z',), {}), # noqa: E241 | ||
('rfind', ('a',), {}), # noqa: E241 | ||
('rindex', ('',), {}), # noqa: E241 | ||
('rjust', (10,), {}), # noqa: E241 | ||
('rpartition', (' ',), {'expand': False}), # noqa: E241 | ||
('rpartition', (' ',), {'expand': True}), # noqa: E241 | ||
('slice', (0, 1,), {}), # noqa: E241 | ||
('slice_replace', (0, 1, 'z',), {}), # noqa: E241 | ||
('split', (' ',), {'expand': False}), # noqa: E241 | ||
('split', (' ',), {'expand': True}), # noqa: E241 | ||
('startswith', ('a',), {}), # noqa: E241 | ||
# translating unicode points of "a" to "d" | ||
('translate', ({97: 100},), {}), # noqa: E241 | ||
('wrap', (2,), {}), # noqa: E241 | ||
('zfill', (10,), {}) # noqa: E241 | ||
] + list(zip([ | ||
# methods without positional arguments: zip with empty tuple | ||
# methods without positional arguments: zip with empty tuple and empty dict | ||
'cat', 'len', 'split', 'rsplit', | ||
'partition', 'rpartition', 'get_dummies', | ||
'slice', 'slice_replace', | ||
|
@@ -67,17 +80,42 @@ def assert_series_or_index_equal(left, right): | |
'isalpha', 'isnumeric', 'isalnum', | ||
'isdigit', 'isdecimal', 'isspace', | ||
'islower', 'isupper', 'istitle' | ||
], [tuple()] * 100)) | ||
ids, _ = zip(*_all_string_methods) # use method name as fixture-id | ||
], [()] * 100, [{}] * 100)) | ||
ids, _, _ = zip(*_any_string_method) # use method name as fixture-id | ||
|
||
|
||
# test that the above list captures all methods of StringMethods | ||
missing_methods = {f for f in dir(strings.StringMethods) | ||
if not f.startswith('_')} - set(ids) | ||
assert not missing_methods | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This makes sure that:
|
||
|
||
@pytest.fixture(params=_all_string_methods, ids=ids) | ||
def all_string_methods(request): | ||
|
||
@pytest.fixture(params=_any_string_method, ids=ids) | ||
def any_string_method(request): | ||
""" | ||
Fixture for all public methods of `StringMethods` | ||
|
||
This fixture returns a tuple of the method name and a list of sample values | ||
for the required positional arguments of that method. | ||
This fixture returns a tuple of the method name and sample arguments | ||
necessary to call the method. | ||
|
||
Returns | ||
------- | ||
method_name : str | ||
The name of the method in `StringMethods` | ||
args : tuple | ||
Sample values for the positional arguments | ||
kwargs : dict | ||
Sample values for the keyword arguments | ||
|
||
Examples | ||
-------- | ||
>>> def test_something(any_string_method): | ||
... s = pd.Series(['a', 'b', np.nan, 'd']) | ||
... | ||
... method_name, args, kwargs = any_string_method | ||
... method = getattr(s.str, method_name) | ||
... # will not raise | ||
... method(*args, **kwargs) | ||
""" | ||
return request.param | ||
|
||
|
@@ -188,14 +226,14 @@ def test_api_mi_raises(self): | |
@pytest.mark.parametrize('box', [Series, Index]) | ||
def test_api_per_method(self, box, dtype, | ||
any_allowed_skipna_inferred_dtype, | |
all_string_methods): | ||
any_string_method): | ||
# this test does not check correctness of the different methods, | ||
# just that the methods work on the specified (inferred) dtypes, | ||
# and raise on all others | ||
|
||
# one instance of each parametrized fixture | ||
inferred_dtype, values = any_allowed_skipna_inferred_dtype | ||
method_name, minimal_args = all_string_methods | ||
method_name, args, kwargs = any_string_method | ||
|
||
# TODO: get rid of these xfails | ||
if (method_name not in ['encode', 'decode', 'len'] | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Same comment as above.
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. These xfails will be gone, and then the test reads very clearly, IMO.
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This test cannot be broken up as easily, because the allowed types depend on the method being checked! |
||
|
@@ -220,6 +258,10 @@ def test_api_per_method(self, box, dtype, | |
if (method_name in ['partition', 'rpartition'] and box == Index | ||
and inferred_dtype != 'bytes'): | ||
pytest.xfail(reason='Method not nan-safe on Index; see GH 23558') | ||
if (method_name == 'split' and box == Index | ||
and inferred_dtype in ['mixed', 'mixed-integer'] | ||
and dtype == object and kwargs.get('expand', None) == True): | ||
pytest.xfail(reason='Method not nan-safe on Index; see GH 23677') | ||
|
||
t = box(values, dtype=dtype) # explicit dtype to avoid casting | ||
method = getattr(t.str, method_name) | ||
|
@@ -236,14 +278,34 @@ def test_api_per_method(self, box, dtype, | |
+ ['mixed', 'mixed-integer'] * mixed_allowed) | ||
|
||
if inferred_dtype in allowed_types: | ||
method(*minimal_args) # works! | ||
method(*args, **kwargs) # works! | ||
else: | ||
# GH 23011, GH 23163 | ||
msg = ('Cannot use .str.{name} with values of inferred dtype ' | ||
'{inferred_dtype!r}.'.format(name=method_name, | ||
inferred_dtype=inferred_dtype)) | ||
with tm.assert_raises_regex(TypeError, msg): | ||
method(*minimal_args) | ||
with pytest.raises(TypeError, match=msg): | ||
method(*args, **kwargs) | ||
|
||
def test_api_for_categorical(self, any_string_method): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. These are the remnants of `test_str_accessor_api_for_categorical` (the test removed in the first diff above). |
||
# https://github.com/pandas-dev/pandas/issues/10661 | ||
s = Series(list('aabb')) | ||
s = s + " " + s | ||
c = s.astype('category') | ||
assert isinstance(c.str, strings.StringMethods) | ||
|
||
method_name, args, kwargs = any_string_method | ||
|
||
result = getattr(c.str, method_name)(*args, **kwargs) | ||
expected = getattr(s.str, method_name)(*args, **kwargs) | ||
|
||
if isinstance(result, DataFrame): | ||
tm.assert_frame_equal(result, expected) | ||
elif isinstance(result, Series): | ||
tm.assert_series_equal(result, expected) | ||
else: | ||
# str.cat(others=None) returns string, for example | ||
assert result == expected | ||
|
||
def test_iter(self): | ||
# GH3638 | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The full list of these functions & arg-combinations is reflected in the `any_string_method` fixture of `test_strings.py` now.