10000 TST (string dtype): remove usage of 'string[pyarrow_numpy]' alias (#59758) · pandas-dev/pandas@000ea36 · GitHub
[go: up one dir, main page]

Skip to content

Commit 000ea36

Browse files
TST (string dtype): remove usage of 'string[pyarrow_numpy]' alias (#59758)
1 parent d1052cf commit 000ea36

29 files changed

+119
-134
lines changed

pandas/conftest.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1228,6 +1228,34 @@ def string_dtype(request):
12281228
return request.param
12291229

12301230

1231+
@pytest.fixture(
1232+
params=[
1233+
("python", pd.NA),
1234+
pytest.param(("pyarrow", pd.NA), marks=td.skip_if_no("pyarrow")),
1235+
pytest.param(("pyarrow", np.nan), marks=td.skip_if_no("pyarrow")),
1236+
("python", np.nan),
1237+
],
1238+
ids=[
1239+
"string=string[python]",
1240+
"string=string[pyarrow]",
1241+
"string=str[pyarrow]",
1242+
"string=str[python]",
1243+
],
1244+
)
1245+
def string_dtype_no_object(request):
1246+
"""
1247+
Parametrized fixture for string dtypes.
1248+
* 'string[python]' (NA variant)
1249+
* 'string[pyarrow]' (NA variant)
1250+
* 'str' (NaN variant, with pyarrow)
1251+
* 'str' (NaN variant, without pyarrow)
1252+
"""
1253+
# need to instantiate the StringDtype here instead of in the params
1254+
# to avoid importing pyarrow during test collection
1255+
storage, na_value = request.param
1256+
return pd.StringDtype(storage, na_value)
1257+
1258+
12311259
@pytest.fixture(
12321260
params=[
12331261
"string[python]",

pandas/tests/apply/test_numba.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
import pandas.util._test_decorators as td
77

8+
import pandas as pd
89
from pandas import (
910
DataFrame,
1011
Index,
@@ -29,11 +30,10 @@ def test_numba_vs_python_noop(float_frame, apply_axis):
2930

3031
def test_numba_vs_python_string_index():
3132
# GH#56189
32-
pytest.importorskip("pyarrow")
3333
df = DataFrame(
3434
1,
35-
index=Index(["a", "b"], dtype="string[pyarrow_numpy]"),
36-
columns=Index(["x", "y"], dtype="string[pyarrow_numpy]"),
35+
index=Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)),
36+
columns=Index(["x", "y"], dtype=pd.StringDtype(na_value=np.nan)),
3737
)
3838
func = lambda x: x
3939
result = df.apply(func, engine="numba", axis=0)

pandas/tests/arrays/string_/test_string_arrow.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -241,10 +241,11 @@ def test_setitem_invalid_indexer_raises():
241241
arr[[0, 1]] = ["foo", "bar", "baz"]
242242

243243

244-
@pytest.mark.parametrize("dtype", ["string[pyarrow]", "string[pyarrow_numpy]"])
245-
def test_pickle_roundtrip(dtype):
244+
@pytest.mark.parametrize("na_value", [pd.NA, np.nan])
245+
def test_pickle_roundtrip(na_value):
246246
# GH 42600
247247
pytest.importorskip("pyarrow")
248+
dtype = StringDtype("pyarrow", na_value=na_value)
248249
expected = pd.Series(range(10), dtype=dtype)
249250
expected_sliced = expected.head(2)
250251
full_pickled = pickle.dumps(expected)

pandas/tests/base/test_misc.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -180,9 +180,7 @@ def test_access_by_position(index_flat):
180180
assert index[-1] == index[size - 1]
181181

182182
msg = f"index {size} is out of bounds for axis 0 with size {size}"
183-
if is_dtype_equal(index.dtype, "string[pyarrow]") or is_dtype_equal(
184-
index.dtype, "string[pyarrow_numpy]"
185-
):
183+
if isinstance(index.dtype, pd.StringDtype) and index.dtype.storage == "pyarrow":
186184
msg = "index out of bounds"
187185
with pytest.raises(IndexError, match=msg):
188186
index[size]

pandas/tests/frame/indexing/test_indexing.py

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1955,13 +1955,11 @@ def test_adding_new_conditional_column() -> None:
19551955
("dtype", "infer_string"),
19561956
[
19571957
(object, False),
1958-
("string[pyarrow_numpy]", True),
1958+
(pd.StringDtype(na_value=np.nan), True),
19591959
],
19601960
)
19611961
def test_adding_new_conditional_column_with_string(dtype, infer_string) -> None:
19621962
# https://github.com/pandas-dev/pandas/issues/56204
1963-
pytest.importorskip("pyarrow")
1964-
19651963
df = DataFrame({"a": [1, 2], "b": [3, 4]})
19661964
with pd.option_context("future.infer_string", infer_string):
19671965
df.loc[df["a"] == 1, "c"] = "1"
@@ -1971,16 +1969,14 @@ def test_adding_new_conditional_column_with_string(dtype, infer_string) -> None:
19711969
tm.assert_frame_equal(df, expected)
19721970

19731971

1974-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
19751972
def test_add_new_column_infer_string():
19761973
# GH#55366
1977-
pytest.importorskip("pyarrow")
19781974
df = DataFrame({"x": [1]})
19791975
with pd.option_context("future.infer_string", True):
19801976
df.loc[df["x"] == 1, "y"] = "1"
19811977
expected = DataFrame(
1982-
{"x": [1], "y": Series(["1"], dtype="string[pyarrow_numpy]")},
1983-
columns=Index(["x", "y"], dtype=object),
1978+
{"x": [1], "y": Series(["1"], dtype=pd.StringDtype(na_value=np.nan))},
1979+
columns=Index(["x", "y"], dtype="str"),
19841980
)
19851981
tm.assert_frame_equal(df, expected)
19861982

pandas/tests/frame/methods/test_rank.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
)
1515
from pandas.compat import HAS_PYARROW
1616

17+
import pandas as pd
1718
from pandas import (
1819
DataFrame,
1920
Index,
@@ -509,14 +510,13 @@ def test_rank_mixed_axis_zero(self, data, expected):
509510
result = df.rank(numeric_only=True)
510511
tm.assert_frame_equal(result, expected)
511512

512-
@pytest.mark.parametrize(
513-
"dtype, exp_dtype",
514-
[("string[pyarrow]", "Int64"), ("string[pyarrow_numpy]", "float64")],
515-
)
516-
def test_rank_string_dtype(self, dtype, exp_dtype):
513+
def test_rank_string_dtype(self, string_dtype_no_object):
517514
# GH#55362
518-
pytest.importorskip("pyarrow")
519-
obj = Series(["foo", "foo", None, "foo"], dtype=dtype)
515+
obj = Series(["foo", "foo", None, "foo"], dtype=string_dtype_no_object)
520516
result = obj.rank(method="first")
517+
exp_dtype = "Int64" if string_dtype_no_object.na_value is pd.NA else "float64"
518+
if string_dtype_no_object.storage == "python":
519+
# TODO nullable string[python] should also return nullable Int64
520+
exp_dtype = "float64"
521521
expected = Series([1, 2, None, 3], dtype=exp_dtype)
522522
tm.assert_series_equal(result, expected)

pandas/tests/frame/test_constructors.py

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2721,8 +2721,7 @@ def test_construct_with_strings_and_none(self):
27212721

27222722
def test_frame_string_inference(self):
27232723
# GH#54430
2724-
pytest.importorskip("pyarrow")
2725-
dtype = "string[pyarrow_numpy]"
2724+
dtype = pd.StringDtype(na_value=np.nan)
27262725
expected = DataFrame(
27272726
{"a": ["a", "b"]}, dtype=dtype, columns=Index(["a"], dtype=dtype)
27282727
)
@@ -2756,8 +2755,7 @@ def test_frame_string_inference(self):
27562755

27572756
def test_frame_string_inference_array_string_dtype(self):
27582757
# GH#54496
2759-
pytest.importorskip("pyarrow")
2760-
dtype = "string[pyarrow_numpy]"
2758+
dtype = pd.StringDtype(na_value=np.nan)
27612759
expected = DataFrame(
27622760
{"a": ["a", "b"]}, dtype=dtype, columns=Index(["a"], dtype=dtype)
27632761
)
@@ -2781,7 +2779,6 @@ def test_frame_string_inference_array_string_dtype(self):
27812779

27822780
def test_frame_string_inference_block_dim(self):
27832781
# GH#55363
2784-
pytest.importorskip("pyarrow")
27852782
with pd.option_context("future.infer_string", True):
27862783
df = DataFrame(np.array([["hello", "goodbye"], ["hello", "Hello"]]))
27872784
assert df._mgr.blocks[0].ndim == 2

pandas/tests/groupby/methods/test_size.py

Lines changed: 2 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,6 @@
33

44
from pandas._config import using_string_dtype
55

6-
import pandas.util._test_decorators as td
7-
86
from pandas.core.dtypes.common import is_integer_dtype
97

108
from pandas import (
@@ -111,16 +109,9 @@ def test_size_series_masked_type_returns_Int64(dtype):
111109

112110

113111
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
114-
@pytest.mark.parametrize(
115-
"dtype",
116-
[
117-
object,
118-
pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")),
119-
pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")),
120-
],
121-
)
122-
def test_size_strings(dtype):
112+
def test_size_strings(any_string_dtype):
123113
# GH#55627
114+
dtype = any_string_dtype
124115
df = DataFrame({"a": ["a", "a", "b"], "b": "a"}, dtype=dtype)
125116
result = df.groupby("a")["b"].size()
126117
exp_dtype = "Int64" if dtype == "string[pyarrow]" else "int64"

pandas/tests/groupby/methods/test_value_counts.py

Lines changed: 3 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,6 @@
88
import numpy as np
99
import pytest
1010

11-
import pandas.util._test_decorators as td
12-
1311
from pandas import (
1412
Categorical,
1513
CategoricalIndex,
@@ -389,14 +387,6 @@ def test_against_frame_and_seriesgroupby(
389387
tm.assert_frame_equal(result, expected)
390388

391389

392-
@pytest.mark.parametrize(
393-
"dtype",
394-
[
395-
object,
396-
pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")),
397-
pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")),
398-
],
399-
)
400390
@pytest.mark.parametrize("normalize", [True, False])
401391
@pytest.mark.parametrize(
402392
"sort, ascending, expected_rows, expected_count, expected_group_size",
@@ -414,9 +404,10 @@ def test_compound(
414404
expected_rows,
415405
expected_count,
416406
expected_group_size,
417-
dtype,
407+
any_string_dtype,
418408
using_infer_string,
419409
):
410+
dtype = any_string_dtype
420411
education_df = education_df.astype(dtype)
421412
education_df.columns = education_df.columns.astype(dtype)
422413
# Multiple groupby keys and as_index=False
@@ -433,6 +424,7 @@ def test_compound(
433424
expected["proportion"] = expected_count
434425
expected["proportion"] /= expected_group_size
435426
if dtype == "string[pyarrow]":
427+
# TODO(nullable) also string[python] should return nullable dtypes
436428
expected["proportion"] = expected["proportion"].convert_dtypes()
437429
else:
438430
expected["count"] = expected_count

pandas/tests/groupby/test_groupby.py

Lines changed: 2 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2832,20 +2832,13 @@ def test_rolling_wrong_param_min_period():
28322832
test_df.groupby("name")["val"].rolling(window=2, min_period=1).sum()
28332833

28342834

2835-
@pytest.mark.parametrize(
2836-
"dtype",
2837-
[
2838-
object,
2839-
pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")),
2840-
],
2841-
)
2842-
def test_by_column_values_with_same_starting_value(dtype):
2835+
def test_by_column_values_with_same_starting_value(any_string_dtype):
28432836
# GH29635
28442837
df = DataFrame(
28452838
{
28462839
"Name": ["Thomas", "Thomas", "Thomas John"],
28472840
"Credit": [1200, 1300, 900],
2848-
"Mood": Series(["sad", "happy", "happy"], dtype=dtype),
2841+
"Mood": Series(["sad", "happy", "happy"], dtype=any_string_dtype),
28492842
}
28502843
)
28512844
aggregate_details = {"Mood": Series.mode, "Credit": "sum"}

0 commit comments

Comments
 (0)
0