TST (string dtype): change any_string_dtype fixture to use actual dtype instances by jorisvandenbossche · Pull Request #59345 · pandas-dev/pandas · GitHub
[go: up one dir, main page]

Skip to content

TST (string dtype): change any_string_dtype fixture to use actual dtype instances #59345

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Next Next commit
TST (string dtype): change any_string_dtype fixture to use actual dtype instances
  • Loading branch information
jorisvandenbossche committed Jul 30, 2024
commit 4d875d1c50e6af0bf37fc0fe851c0030abe7eb21
23 changes: 16 additions & 7 deletions pandas/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -1354,18 +1354,27 @@ def object_dtype(request):

@pytest.fixture(
params=[
"object",
"string[python]",
pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")),
pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")),
]
np.dtype("object"),
pd.StringDtype("python"),
pytest.param(pd.StringDtype("pyarrow"), marks=td.skip_if_no("pyarrow")),
pytest.param(
pd.StringDtype("pyarrow", na_value=np.nan), marks=td.skip_if_no("pyarrow")
),
],
ids=[
"string=object",
"string=string[python]",
"string=string[pyarrow]",
"string=str[pyarrow]",
],
)
def any_string_dtype(request):
"""
Parametrized fixture for string dtypes.
* 'object'
* 'string[python]'
* 'string[pyarrow]'
* 'string[python]' (NA variant)
* 'string[pyarrow]' (NA variant)
* 'str' (NaN variant, with pyarrow)
"""
return request.param

Expand Down
10 changes: 9 additions & 1 deletion pandas/tests/strings/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,15 @@

import pandas as pd

object_pyarrow_numpy = ("object", "string[pyarrow_numpy]")

def is_object_or_nan_string_dtype(dtype):
    """
    Report whether a string-like dtype uses NaN as its missing-value marker.

    Returns True when ``dtype`` is the NumPy object dtype, or when its
    ``na_value`` attribute is ``np.nan``; returns False otherwise.
    """
    # NumPy object dtype: no ``na_value`` attribute is consulted in this case.
    if isinstance(dtype, np.dtype) and dtype == "object":
        return True
    # Identity check on purpose: only the ``np.nan`` singleton counts.
    return dtype.na_value is np.nan


def _convert_na_value(ser, expected):
Expand Down
70 changes: 52 additions & 18 deletions pandas/tests/strings/test_find_replace.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
)
from pandas.tests.strings import (
_convert_na_value,
object_pyarrow_numpy,
is_object_or_nan_string_dtype,
)

# --------------------------------------------------------------------------------------
Expand All @@ -33,7 +33,9 @@ def test_contains(any_string_dtype):
pat = "mmm[_]+"

result = values.str.contains(pat)
expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean"
expected_dtype = (
"object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
)
expected = Series(
np.array([False, np.nan, True, True, False], dtype=np.object_),
dtype=expected_dtype,
Expand All @@ -52,7 +54,9 @@ def test_contains(any_string_dtype):
dtype=any_string_dtype,
)
result = values.str.contains(pat)
expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean"
expected_dtype = (
np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
)
expected = Series(np.array([False, False, True, True]), dtype=expected_dtype)
tm.assert_series_equal(result, expected)

Expand All @@ -79,14 +83,18 @@ def test_contains(any_string_dtype):
pat = "mmm[_]+"

result = values.str.contains(pat)
expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean"
expected_dtype = (
"object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
)
expected = Series(
np.array([False, np.nan, True, True], dtype=np.object_), dtype=expected_dtype
)
tm.assert_series_equal(result, expected)

result = values.str.contains(pat, na=False)
expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean"
expected_dtype = (
np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
)
expected = Series(np.array([False, False, True, True]), dtype=expected_dtype)
tm.assert_series_equal(result, expected)

Expand Down Expand Up @@ -171,7 +179,9 @@ def test_contains_moar(any_string_dtype):
)

result = s.str.contains("a")
expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean"
expected_dtype = (
"object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
)
expected = Series(
[False, False, False, True, True, False, np.nan, False, False, True],
dtype=expected_dtype,
Expand Down Expand Up @@ -212,7 +222,9 @@ def test_contains_nan(any_string_dtype):
s = Series([np.nan, np.nan, np.nan], dtype=any_string_dtype)

result = s.str.contains("foo", na=False)
expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean"
expected_dtype = (
np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
)
expected = Series([False, False, False], dtype=expected_dtype)
tm.assert_series_equal(result, expected)

Expand All @@ -230,7 +242,9 @@ def test_contains_nan(any_string_dtype):
tm.assert_series_equal(result, expected)

result = s.str.contains("foo")
expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean"
expected_dtype = (
"object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
)
expected = Series([np.nan, np.nan, np.nan], dtype=expected_dtype)
tm.assert_series_equal(result, expected)

Expand Down Expand Up @@ -675,7 +689,9 @@ def test_replace_regex_single_character(regex, any_string_dtype):

def test_match(any_string_dtype):
# New match behavior introduced in 0.13
expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean"
expected_dtype = (
"object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
)

values = Series(["fooBAD__barBAD", np.nan, "foo"], dtype=any_string_dtype)
result = values.str.match(".*(BAD[_]+).*(BAD)")
Expand Down Expand Up @@ -730,20 +746,26 @@ def test_match_na_kwarg(any_string_dtype):
s = Series(["a", "b", np.nan], dtype=any_string_dtype)

result = s.str.match("a", na=False)
expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean"
expected_dtype = (
np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
)
expected = Series([True, False, False], dtype=expected_dtype)
tm.assert_series_equal(result, expected)

result = s.str.match("a")
expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean"
expected_dtype = (
"object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
)
expected = Series([True, False, np.nan], dtype=expected_dtype)
tm.assert_series_equal(result, expected)


def test_match_case_kwarg(any_string_dtype):
values = Series(["ab", "AB", "abc", "ABC"], dtype=any_string_dtype)
result = values.str.match("ab", case=False)
expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean"
expected_dtype = (
np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
)
expected = Series([True, True, True, True], dtype=expected_dtype)
tm.assert_series_equal(result, expected)

Expand All @@ -759,7 +781,9 @@ def test_fullmatch(any_string_dtype):
["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype
)
result = ser.str.fullmatch(".*BAD[_]+.*BAD")
expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean"
expected_dtype = (
"object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
)
expected = Series([True, False, np.nan, False], dtype=expected_dtype)
tm.assert_series_equal(result, expected)

Expand All @@ -768,7 +792,9 @@ def test_fullmatch_dollar_literal(any_string_dtype):
# GH 56652
ser = Series(["foo", "foo$foo", np.nan, "foo$"], dtype=any_string_dtype)
result = ser.str.fullmatch("foo\\$")
expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean"
expected_dtype = (
"object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
)
expected = Series([False, False, np.nan, True], dtype=expected_dtype)
tm.assert_series_equal(result, expected)

Expand All @@ -778,14 +804,18 @@ def test_fullmatch_na_kwarg(any_string_dtype):
["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype
)
result = ser.str.fullmatch(".*BAD[_]+.*BAD", na=False)
expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean"
expected_dtype = (
np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
)
expected = Series([True, False, False, False], dtype=expected_dtype)
tm.assert_series_equal(result, expected)


def test_fullmatch_case_kwarg(any_string_dtype, performance_warning):
ser = Series(["ab", "AB", "abc", "ABC"], dtype=any_string_dtype)
expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean"
expected_dtype = (
np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
)

expected = Series([True, False, False, False], dtype=expected_dtype)

Expand Down Expand Up @@ -859,7 +889,9 @@ def test_find(any_string_dtype):
ser = Series(
["ABCDEFG", "BCDEFEF", "DEFGHIJEF", "EFGHEF", "XXXX"], dtype=any_string_dtype
)
expected_dtype = np.int64 if any_string_dtype in object_pyarrow_numpy else "Int64"
expected_dtype = (
np.int64 if is_object_or_nan_string_dtype(any_string_dtype) else "Int64"
)

result = ser.str.find("EF")
expected = Series([4, 3, 1, 0, -1], dtype=expected_dtype)
Expand Down Expand Up @@ -911,7 +943,9 @@ def test_find_nan(any_string_dtype):
ser = Series(
["ABCDEFG", np.nan, "DEFGHIJEF", np.nan, "XXXX"], dtype=any_string_dtype
)
expected_dtype = np.float64 if any_string_dtype in object_pyarrow_numpy else "Int64"
expected_dtype = (
np.float64 if is_object_or_nan_string_dtype(any_string_dtype) else "Int64"
)

result = ser.str.find("EF")
expected = Series([4, np.nan, 1, np.nan, -1], dtype=expected_dtype)
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/strings/test_split_partition.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
)
from pandas.tests.strings import (
_convert_na_value,
object_pyarrow_numpy,
is_object_or_nan_string_dtype,
)


Expand Down Expand Up @@ -385,7 +385,7 @@ def test_split_nan_expand(any_string_dtype):
# check that these are actually np.nan/pd.NA and not None
# TODO see GH 18463
# tm.assert_frame_equal does not differentiate
if any_string_dtype in object_pyarrow_numpy:
if is_object_or_nan_string_dtype(any_string_dtype):
assert all(np.isnan(x) for x in result.iloc[1])
else:
assert all(x is pd.NA for x in result.iloc[1])
Expand Down
32 changes: 23 additions & 9 deletions pandas/tests/strings/test_strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
)
import pandas._testing as tm
from pandas.core.strings.accessor import StringMethods
from pandas.tests.strings import object_pyarrow_numpy
from pandas.tests.strings import is_object_or_nan_string_dtype


@pytest.mark.parametrize("pattern", [0, True, Series(["foo", "bar"])])
Expand All @@ -41,7 +41,9 @@ def test_iter_raises():
def test_count(any_string_dtype):
ser = Series(["foo", "foofoo", np.nan, "foooofooofommmfoo"], dtype=any_string_dtype)
result = ser.str.count("f[o]+")
expected_dtype = np.float64 if any_string_dtype in object_pyarrow_numpy else "Int64"
expected_dtype = (
np.float64 if is_object_or_nan_string_dtype(any_string_dtype) else "Int64"
)
expected = Series([1, 2, np.nan, 4], dtype=expected_dtype)
tm.assert_series_equal(result, expected)

Expand Down Expand Up @@ -93,7 +95,7 @@ def test_repeat_with_null(any_string_dtype, arg, repeat):

def test_empty_str_methods(any_string_dtype):
empty_str = empty = Series(dtype=any_string_dtype)
if any_string_dtype in object_pyarrow_numpy:
if is_object_or_nan_string_dtype(any_string_dtype):
empty_int = Series(dtype="int64")
empty_bool = Series(dtype=bool)
else:
Expand Down Expand Up @@ -207,7 +209,9 @@ def test_ismethods(method, expected, any_string_dtype):
ser = Series(
["A", "b", "Xy", "4", "3A", "", "TT", "55", "-", " "], dtype=any_string_dtype
)
expected_dtype = "bool" if any_string_dtype in object_pyarrow_numpy else "boolean"
expected_dtype = (
"bool" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
)
expected = Series(expected, dtype=expected_dtype)
result = getattr(ser.str, method)()
tm.assert_series_equal(result, expected)
Expand All @@ -233,7 +237,9 @@ def test_isnumeric_unicode(method, expected, any_string_dtype):
["A", "3", "¼", "★", "፸", "3", "four"], # noqa: RUF001
dtype=any_string_dtype,
)
expected_dtype = "bool" if any_string_dtype in object_pyarrow_numpy else "boolean"
expected_dtype = (
"bool" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
)
expected = Series(expected, dtype=expected_dtype)
result = getattr(ser.str, method)()
tm.assert_series_equal(result, expected)
Expand All @@ -253,7 +259,9 @@ def test_isnumeric_unicode(method, expected, any_string_dtype):
def test_isnumeric_unicode_missing(method, expected, any_string_dtype):
values = ["A", np.nan, "¼", "★", np.nan, "3", "four"] # noqa: RUF001
ser = Series(values, dtype=any_string_dtype)
expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean"
expected_dtype = (
"object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
)
expected = Series(expected, dtype=expected_dtype)
result = getattr(ser.str, method)()
tm.assert_series_equal(result, expected)
Expand Down Expand Up @@ -284,7 +292,9 @@ def test_len(any_string_dtype):
dtype=any_string_dtype,
)
result = ser.str.len()
expected_dtype = "float64" if any_string_dtype in object_pyarrow_numpy else "Int64"
expected_dtype = (
"float64" if is_object_or_nan_string_dtype(any_string_dtype) else "Int64"
)
expected = Series([3, 4, 6, np.nan, 8, 4, 1], dtype=expected_dtype)
tm.assert_series_equal(result, expected)

Expand Down Expand Up @@ -313,7 +323,9 @@ def test_index(method, sub, start, end, index_or_series, any_string_dtype, expec
obj = index_or_series(
["ABCDEFG", "BCDEFEF", "DEFGHIJEF", "EFGHEF"], dtype=any_string_dtype
)
expected_dtype = np.int64 if any_string_dtype in object_pyarrow_numpy else "Int64"
expected_dtype = (
np.int64 if is_object_or_nan_string_dtype(any_string_dtype) else "Int64"
)
expected = index_or_series(expected, dtype=expected_dtype)

result = getattr(obj.str, method)(sub, start, end)
Expand Down Expand Up @@ -354,7 +366,9 @@ def test_index_wrong_type_raises(index_or_series, any_string_dtype, method):
)
def test_index_missing(any_string_dtype, method, exp):
ser = Series(["abcb", "ab", "bcbe", np.nan], dtype=any_string_dtype)
expected_dtype = np.float64 if any_string_dtype in object_pyarrow_numpy else "Int64"
expected_dtype = (
np.float64 if is_object_or_nan_string_dtype(any_string_dtype) else "Int64"
)

result = getattr(ser.str, method)("b")
expected = Series(exp + [np.nan], dtype=expected_dtype)
Expand Down
Loading
0