-
-
Notifications
You must be signed in to change notification settings - Fork 18.7k
TST: add method/dtype coverage to str-accessor; precursor to #23167 #23582
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
91b720e
dcee05a
41cecb9
01a3c10
a94f569
f0ae1db
16fe71c
78cead0
0aaa04e
9b36a50
c0752eb
a53a28e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
- Loading branch information
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,17 +1,20 @@ | ||
# -*- coding: utf-8 -*- | ||
# pylint: disable-msg=E1101,W0612 | ||
|
||
from datetime import datetime, timedelta | ||
from datetime import datetime, date, timedelta, time | ||
import pytest | ||
import re | ||
from decimal import Decimal | ||
|
||
from numpy import nan as NA | ||
import numpy as np | ||
from numpy.random import randint | ||
|
||
from pandas.compat import range, u | ||
from pandas.compat import range, u, PY3 | ||
import pandas.compat as compat | ||
from pandas import Index, Series, DataFrame, isna, MultiIndex, notna, concat | ||
from pandas import (Index, Series, DataFrame, isna, MultiIndex, notna, concat, | ||
Timestamp, Period, NaT, Interval) | ||
import pandas._libs.lib as lib | ||
|
||
from pandas.util.testing import assert_series_equal, assert_index_equal | ||
import pandas.util.testing as tm | ||
|
@@ -26,6 +29,157 @@ def assert_series_or_index_equal(left, right): | |
assert_index_equal(left, right) | ||
|
||
|
||
# method names plus minimal set of arguments to call | ||
_all_string_methods = [ | ||
('get', [0]), | ||
('join', [',']), | ||
('contains', ['some_pattern']), | ||
('match', ['some_pattern']), | ||
('count', ['some_pattern']), | ||
('startswith', ['some_pattern']), | ||
('endswith', ['some_pattern']), | ||
('findall', ['some_pattern']), | ||
('find', ['some_pattern']), | ||
('rfind', ['some_pattern']), | ||
# because "index"/"rindex" fail (intentionally) if the string is not found | ||
# (and we're testing on generic data), search only for empty string | ||
('index', ['']), | ||
('rindex', ['']), | ||
('extract', [r'(some_pattern)']), | ||
('extractall', [r'(some_pattern)']), | ||
('replace', ['some_pattern', 'other_pattern']), | ||
('repeat', [10]), | ||
('pad', [10]), | ||
('center', [10]), | ||
('ljust', [10]), | ||
('rjust', [10]), | ||
('zfill', [10]), | ||
('wrap', [10]), | ||
('encode', ['utf8']), | ||
('decode', ['utf8']), | ||
('translate', [{97: 100}]), # translating 'a' to 'd' | ||
('normalize', ['NFC']) | ||
] + list(zip([ | ||
# methods without positional arguments: zip with empty tuple | ||
'cat', 'len', 'split', 'rsplit', | ||
'partition', 'rpartition', 'get_dummies', | ||
'slice', 'slice_replace', | ||
'strip', 'lstrip', 'rstrip', | ||
'lower', 'upper', 'capitalize', | ||
'title', 'swapcase', | ||
'isalpha', 'isnumeric', 'isalnum', | ||
'isdigit', 'isdecimal', 'isspace', | ||
'islower', 'isupper', 'istitle' | ||
], [tuple()] * 100)) | ||
ids, _ = zip(*_all_string_methods) # use method name as fixture-id | ||
|
||
|
||
@pytest.fixture(params=_all_string_methods, ids=ids) | ||
def all_string_methods(request): | ||
""" | ||
Fixture for all public methods of `StringMethods` | ||
|
||
This fixture returns a tuple of the method name and a list of sample values | ||
for the required positional arguments of that method. | ||
""" | ||
return request.param | ||
|
||
|
||
_all_allowed_skipna_inferred_dtypes = [ | ||
('string', ['a', np.nan, 'c']), | ||
('unicode' if not PY3 else 'string', [u('a'), np.nan, u('c')]), | ||
('bytes' if PY3 else 'string', [b'a', np.nan, b'c']), | ||
('empty', [np.nan, np.nan, np.nan]), | ||
('empty', []), | ||
('mixed-integer', ['a', np.nan, 2]), | ||
('mixed', ['a', np.nan, 2.0])] | ||
ids, _ = zip(*_all_allowed_skipna_inferred_dtypes) # use inferred type as id | ||
|
||
|
||
@pytest.fixture(params=_all_allowed_skipna_inferred_dtypes, ids=ids) | ||
def all_allowed_skipna_inferred_dtypes(request): | ||
""" | ||
Fixture for all (inferred) dtypes allowed in StringMethods.__init__ | ||
|
||
Returns an np.ndarray that will be inferred to have the given dtype (when | ||
skipping missing values). | ||
|
||
The allowed (inferred) types are: | ||
* 'string' | ||
* 'unicode' (if PY2) | ||
* 'empty' | ||
* 'bytes' (if PY3) | ||
* 'mixed' | ||
* 'mixed-integer' | ||
""" | ||
inferred_dtype, values = request.param | ||
values = np.array(values, dtype=object) # object dtype to avoid casting | ||
|
||
# make sure the inferred dtype of the fixture is as requested | ||
assert inferred_dtype == lib.infer_dtype(values, skipna=True) | ||
|
||
return inferred_dtype, values | ||
|
||
|
||
# categoricals are handled separately | ||
_all_skipna_inferred_dtypes = _all_allowed_skipna_inferred_dtypes + [ | ||
('floating', [1.0, np.nan, 2.0]), | ||
('integer', [1, np.nan, 2]), | ||
('mixed-integer-float', [1, np.nan, 2.0]), | ||
('decimal', [Decimal(1), np.nan, Decimal(2)]), | ||
('boolean', [True, np.nan, False]), | ||
('datetime64', [np.datetime64('2013-01-01'), np.nan, | ||
np.datetime64('2018-01-01')]), | ||
('datetime', [Timestamp('20130101'), np.nan, Timestamp('20180101')]), | ||
('date', [date(2013, 1, 1), np.nan, date(2018, 1, 1)]), | ||
# The following two dtypes are commented out due to GH 23554 | ||
# ('complex', [1 + 1j, np.nan, 2 + 2j]), | ||
# ('timedelta64', [np.timedelta64(1, 'D'), | ||
# np.nan, np.timedelta64(2, 'D')]), | ||
('timedelta', [timedelta(1), np.nan, timedelta(2)]), | ||
('time', [time(1), np.nan, time(2)]), | ||
('period', [Period(2013), NaT, Period(2018)]), | ||
('interval', [Interval(0, 1), np.nan, Interval(0, 2)])] | ||
ids, _ = zip(*_all_skipna_inferred_dtypes) # use inferred type as fixture-id | ||
|
||
|
||
@pytest.fixture(params=_all_skipna_inferred_dtypes, ids=ids) | ||
def all_skipna_inferred_dtypes(request): | ||
""" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. so makes sense to also use these in the inferred_type tests then. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You mean for the tests in They're being used in this module already. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. not sure what you mean There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm asking if you want me to move this particular fixture to There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes |
||
Fixture for all inferred dtypes from _libs.lib.infer_dtype | ||
|
||
Returns an np.ndarray that will be inferred to have the given dtype (when | ||
skipping missing values). | ||
|
||
The covered (inferred) types are: | ||
* 'string' | ||
* 'unicode' (if PY2) | ||
* 'empty' | ||
* 'bytes' (if PY3) | ||
* 'mixed' | ||
* 'mixed-integer' | ||
* 'mixed-integer-float' | ||
* 'floating' | ||
* 'integer' | ||
* 'decimal' | ||
* 'boolean' | ||
* 'datetime64' | ||
* 'datetime' | ||
* 'date' | ||
* 'timedelta' | ||
* 'time' | ||
* 'period' | ||
* 'interval' | ||
""" | ||
inferred_dtype, values = request.param | ||
values = np.array(values, dtype=object) # object dtype to avoid casting | ||
|
||
# make sure the inferred dtype of the fixture is as requested | ||
assert inferred_dtype == lib.infer_dtype(values, skipna=True) | ||
|
||
return inferred_dtype, values | ||
|
||
|
||
class TestStringMethods(object): | ||
|
||
def test_api(self): | ||
|
@@ -34,11 +188,115 @@ def test_api(self): | |
assert Series.str is strings.StringMethods | ||
assert isinstance(Series(['']).str, strings.StringMethods) | ||
|
||
# GH 9184 | ||
invalid = Series([1]) | ||
with pytest.raises(AttributeError, match="only use .str accessor"): | ||
invalid.str | ||
assert not hasattr(invalid, 'str') | ||
@pytest.mark.parametrize('dtype', [object, 'category']) | ||
@pytest.mark.parametrize('box', [Series, Index]) | ||
def test_api_per_dtype(self, box, dtype, all_skipna_inferred_dtypes): | ||
# one instance of parametrized fixture | ||
inferred_dtype, values = all_skipna_inferred_dtypes | ||
|
||
t = box(values, dtype=dtype) # explicit dtype to avoid casting | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. you can actually make this 2 tests by putting the xfails in a function. then the test becomes single purpose and you don't have the if statement near the bottom. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't understand what you're asking for here, sorry. The test is already very single-purpose (except the xfails, which will be gone with #23167 and follow-up PRs), and the final if-switch makes it transparent which types are actually passing the constructor, with all other types raising. this would only get harder to understand if it's split into two, no? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. no you have an if else branch. it makes reasoning about this impossible as its very fixture / data dependent. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The if branch is transparently for the cases where the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. if/else make the test very hard to understand. pls break in 2 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This particular test can be broken into
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Respectfully, I find it an unrealistic criterion to not be able to use one simple if-condition in a test (aside from xfails, which will be gone soon after). The whole point is that it's got an extensively parametrized fixture ( Even if I were to split this test into |
||
|
||
# TODO: get rid of these xfails | ||
if dtype == 'category' and inferred_dtype in ['period', 'interval']: | ||
pytest.xfail(reason='Conversion to numpy array fails because ' | ||
'the ._values-attribute is not a numpy array for ' | ||
'PeriodArray/IntervalArray; see GH 23553') | ||
if box == Index and inferred_dtype in ['empty', 'bytes']: | ||
pytest.xfail(reason='Raising too restrictively; ' | ||
'solved by GH 23167') | ||
if (box == Index and dtype == object | ||
and inferred_dtype in ['boolean', 'date', 'time']): | ||
pytest.xfail(reason='Inferring incorrectly because of NaNs; ' | ||
'solved by GH 23167') | ||
if (box == Series | ||
and (dtype == object and inferred_dtype not in [ | ||
'string', 'unicode', 'empty', | ||
'bytes', 'mixed', 'mixed-integer']) | ||
or (dtype == 'category' | ||
and inferred_dtype in ['decimal', 'boolean', 'time'])): | ||
pytest.xfail(reason='Not raising correctly; solved by GH 23167') | ||
|
||
types_passing_constructor = ['string', 'unicode', 'empty', | ||
'bytes', 'mixed', 'mixed-integer'] | ||
if inferred_dtype in types_passing_constructor: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. see my comment above |
||
# GH 6106 | ||
assert isinstance(t.str, strings.StringMethods) | ||
else: | ||
# GH 9184, GH 23011, GH 23163 | ||
with tm.assert_raises_regex(AttributeError, 'Can only use .str ' | ||
'accessor with string values.*'): | ||
t.str | ||
assert not hasattr(t, 'str') | ||
|
||
@pytest.mark.xfail(reason='not correctly raising on master; ' | ||
'solved by GH 23167') | ||
def test_api_mi_raises(self): | ||
mi = MultiIndex.from_arrays([['a', 'b', 'c']]) | ||
with tm.assert_raises_regex(AttributeError, 'Can only use .str ' | ||
'accessor with Index, not MultiIndex'): | ||
mi.str | ||
assert not hasattr(mi, 'str') | ||
|
||
@pytest.mark.parametrize('dtype', [object, 'category']) | ||
@pytest.mark.parametrize('box', [Series, Index]) | ||
def test_api_per_method(self, box, dtype, | ||
all_allowed_skipna_inferred_dtypes, | ||
all_string_methods): | ||
# this test does not check correctness of the different methods, | ||
# just that the methods work on the specified (inferred) dtypes, | ||
# and raise on all others | ||
|
||
# one instance of each parametrized fixture | ||
inferred_dtype, values = all_allowed_skipna_inferred_dtypes | ||
method_name, minimal_args = all_string_methods | ||
|
||
# TODO: get rid of these xfails | ||
if (method_name not in ['encode', 'decode', 'len'] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. same comment as above There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. These xfails will be gone, and then the test reads very clearly, IMO There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this test cannot be broken up as easily, because the allowed types depend on the method being checked! |
||
and inferred_dtype == 'bytes'): | ||
pytest.xfail(reason='Not raising for "bytes", see GH 23011;' | ||
'Also: malformed method names, see GH 23551; ' | ||
'solved by GH 23167') | ||
if (method_name == 'cat' | ||
and inferred_dtype in ['mixed', 'mixed-integer']): | ||
pytest.xfail(reason='Bad error message; should raise better; ' | ||
'solved by GH 23167') | ||
if box == Index and inferred_dtype in ['empty', 'bytes']: | ||
pytest.xfail(reason='Raising too restrictively; ' | ||
'solved by GH 23167') | ||
if (box == Index and dtype == object | ||
and inferred_dtype in ['boolean', 'date', 'time']): | ||
pytest.xfail(reason='Inferring incorrectly because of NaNs; ' | ||
'solved by GH 23167') | ||
if box == Index and dtype == 'category': | ||
pytest.xfail(reason='Broken methods on CategoricalIndex; ' | ||
'see GH 23556') | ||
if (method_name in ['partition', 'rpartition'] and box == Index | ||
and inferred_dtype != 'bytes'): | ||
pytest.xfail(reason='Method not nan-safe on Index; see GH 23558') | ||
|
||
t = box(values, dtype=dtype) # explicit dtype to avoid casting | ||
method = getattr(t.str, method_name) | ||
|
||
bytes_allowed = method_name in ['encode', 'decode', 'len'] | ||
# as of v0.23.4, all methods except 'cat' are very lenient with the | ||
# allowed data types, just returning NaN for entries that error. | ||
# This could be changed with an 'errors'-kwarg to the `str`-accessor, | ||
# see discussion in GH 13877 | ||
mixed_allowed = method_name not in ['cat'] | ||
|
||
allowed_types = (['string', 'unicode', 'empty'] | ||
+ ['bytes'] * bytes_allowed | ||
+ ['mixed', 'mixed-integer'] * mixed_allowed) | ||
|
||
if inferred_dtype in allowed_types: | ||
method(*minimal_args) # works! | ||
else: | ||
# GH 23011, GH 23163 | ||
msg = ('Cannot use .str.{name} with values of inferred dtype ' | ||
'{inferred_dtype!r}.'.format(name=method_name, | ||
inferred_dtype=inferred_dtype)) | ||
with tm.assert_raises_regex(TypeError, msg): | ||
method(*minimal_args) | ||
|
||
def test_iter(self): | ||
# GH3638 | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this is somewhat duplicating:
test_str_accessor_api_for_categorical
can you reconcile and put in a pandas/conftest.py
some of the fixtures can stay here if they are specific to what is being tested. but the top level general stuff should move.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I cannot find the test you're talking about - where is it supposed to be?
Do I understand correctly that you'd like the
_all_string_methods
inpandas/conftest.py
? I would think that fixture is very specific totest_strings
. Thedtype
fixtures make more sense on a higher level.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks. How about moving this test to
test_strings
? It's testing the API, but is clearly only related to thestr
accessor.