8000 test: add tests for fill_null functionality in DataFrame with null va… · kosiew/datafusion-python@b140523 · GitHub
[go: up one dir, main page]

Skip to content

Commit b140523

Browse files
committed
test: add tests for fill_null functionality in DataFrame with null values
1 parent b89c695 commit b140523

File tree

1 file changed

+120
-0
lines changed

1 file changed

+120
-0
lines changed

python/tests/test_dataframe.py

Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,22 @@ def clean_formatter_state():
117117
reset_formatter()
118118

119119

120+
@pytest.fixture
def null_df():
    """Provide a four-row DataFrame whose columns each contain nulls.

    The columns are ``int_col`` (int64), ``float_col`` (float64),
    ``str_col`` (string) and ``bool_col`` (bool); every column holds
    exactly two nulls.
    """
    # One Arrow array per column, keyed by the name the column will carry.
    columns = {
        "int_col": pa.array([1, None, 3, None], type=pa.int64()),
        "float_col": pa.array([4.5, 6.7, None, None], type=pa.float64()),
        "str_col": pa.array(["a", None, "c", None], type=pa.string()),
        "bool_col": pa.array([True, None, False, None], type=pa.bool_()),
    }
    record_batch = pa.RecordBatch.from_arrays(
        list(columns.values()),
        names=list(columns.keys()),
    )

    # The fixture builds its own SessionContext instead of receiving one.
    session = SessionContext()
    return session.create_dataframe([[record_batch]])
134+
135+
120136
def test_select(df):
121137
df_1 = df.select(
122138
column("a") + column("b"),
@@ -1642,8 +1658,112 @@ def test_html_formatter_manual_format_html(clean_formatter_state):
16421658
local_formatter = DataFrameHtmlFormatter(use_shared_styles=False)
16431659

16441660
# Both calls should include styles
1661+
16451662
local_html_1 = local_formatter.format_html([batch], batch.schema)
16461663
local_html_2 = local_formatter.format_html([batch], batch.schema)
16471664

16481665
assert "<style>" in local_html_1
16491666
assert "<style>" in local_html_2
1667+
1668+
1669+
def test_fill_null_basic(null_df):
    """Fill every null with the integer 0 and check each column's values."""
    batch = null_df.fill_null(0).collect()[0]

    # Expected contents per column index after filling with 0.
    expected_columns = [
        [1, 0, 3, 0],  # int_col: nulls become 0
        [4.5, 6.7, 0.0, 0.0],  # float_col: nulls become 0.0
        ["a", "0", "c", "0"],  # str_col: nulls become the string "0"
        [True, False, False, False],  # bool_col: nulls become False
    ]
    for index, expected in enumerate(expected_columns):
        assert batch.column(index).to_pylist() == expected
1683+
1684+
1685+
def test_fill_null_subset(null_df):
    """Fill nulls in the numeric columns only and leave the others alone."""
    numeric_columns = ["int_col", "float_col"]
    batch = null_df.fill_null(0, subset=numeric_columns).collect()[0]

    # Columns named in ``subset`` must have their nulls replaced.
    int_values = batch.column(0).to_pylist()
    assert int_values == [1, 0, 3, 0]
    float_values = batch.column(1).to_pylist()
    assert float_values == [4.5, 6.7, 0.0, 0.0]

    # Columns outside ``subset`` must still contain at least one null.
    str_values = batch.column(2).to_pylist()
    assert None in str_values
    bool_values = batch.column(3).to_pylist()
    assert None in bool_values
1698+
1699+
1700+
def test_fill_null_specific_types(null_df):
    """Fill every null with the string "missing" and check each column.

    NOTE(review): the int, float and bool expectations below assume that a
    fill value which cannot be converted to the column type results in that
    type's zero value (0, 0.0, False) -- confirm against the fill_null
    implementation.
    """
    batch = null_df.fill_null("missing").collect()[0]

    int_values = batch.column(0).to_pylist()
    float_values = batch.column(1).to_pylist()
    str_values = batch.column(2).to_pylist()
    bool_values = batch.column(3).to_pylist()

    # Only the string column is expected to carry the literal fill value.
    assert int_values == [1, 0, 3, 0]
    assert float_values == [4.5, 6.7, 0.0, 0.0]
    assert str_values == ["a", "missing", "c", "missing"]
    assert bool_values == [True, False, False, False]
1712+
1713+
1714+
def test_fill_null_immutability(null_df):
    """Test that the original DataFrame is unchanged after fill_null.

    The filled DataFrame is collected as well, so the test proves that
    fill_null actually ran and replaced the nulls before the original is
    re-checked. Previously the filled frame was assigned but never used.
    """
    # Count the nulls in int_col before fill_null is applied.
    nulls_before = null_df.collect()[0].column(0).to_pylist().count(None)
    assert nulls_before > 0  # Ensure we actually had nulls in the first place

    # Execute the filled plan; its int_col must no longer contain nulls.
    filled = null_df.fill_null(0).collect()[0]
    assert filled.column(0).to_pylist().count(None) == 0

    # Re-collect the original: its null count must be exactly what it was.
    nulls_after = null_df.collect()[0].column(0).to_pylist().count(None)
    assert nulls_after == nulls_before
1729+
1730+
1731+
def test_fill_null_empty_df(ctx):
    """Test fill_null on an empty DataFrame.

    The result is checked without indexing ``collect()[0]``: a zero-row
    result is not guaranteed to come back as at least one record batch, so
    rows are summed over whatever batches are returned and the column names
    are read from the DataFrame's schema instead of from a batch.
    """
    # Create an empty DataFrame with an (int64, string) schema.
    batch = pa.RecordBatch.from_arrays(
        [pa.array([], type=pa.int64()), pa.array([], type=pa.string())],
        names=["a", "b"],
    )
    empty_df = ctx.create_dataframe([[batch]])

    # Fill nulls (should work without errors)
    filled_df = empty_df.fill_null(0)

    # Still empty: no returned batch may contribute any rows.
    batches = filled_df.collect()
    assert sum(result.num_rows for result in batches) == 0

    # Same schema: column names and their order are preserved.
    assert filled_df.schema().names == ["a", "b"]
1749+
1750+
1751+
def test_fill_null_all_null_column(ctx):
    """Fill a string column that consists solely of nulls."""
    # Column "a" has no nulls; column "b" is three nulls typed as string.
    int_array = pa.array([1, 2, 3])
    null_strings = pa.array([None, None, None], type=pa.string())
    source_batch = pa.RecordBatch.from_arrays(
        [int_array, null_strings],
        names=["a", "b"],
    )
    all_null_df = ctx.create_dataframe([[source_batch]])

    # Every entry of column "b" must be replaced by the fill value.
    filled_batch = all_null_df.fill_null("filled").collect()[0]
    assert filled_batch.column(1).to_pylist() == ["filled", "filled", "filled"]

    # The source DataFrame must still report all three nulls in column "b".
    untouched_batch = all_null_df.collect()[0]
    assert untouched_batch.column(1).null_count == 3

0 commit comments

Comments
 (0)
0