@@ -117,6 +117,22 @@ def clean_formatter_state():
117
117
reset_formatter ()
118
118
119
119
120
@pytest.fixture
def null_df():
    """Provide a four-row DataFrame whose every column contains nulls.

    The DataFrame is backed by one record batch with an int64, a float64,
    a string and a boolean column (``int_col``, ``float_col``, ``str_col``,
    ``bool_col``), each mixing real values with nulls.
    """
    # Insertion order of this mapping defines the column order of the batch.
    columns = {
        "int_col": pa.array([1, None, 3, None], type=pa.int64()),
        "float_col": pa.array([4.5, 6.7, None, None], type=pa.float64()),
        "str_col": pa.array(["a", None, "c", None], type=pa.string()),
        "bool_col": pa.array([True, None, False, None], type=pa.bool_()),
    }
    record_batch = pa.RecordBatch.from_arrays(
        list(columns.values()), names=list(columns)
    )

    session = SessionContext()
    return session.create_dataframe([[record_batch]])
134
+
135
+
120
136
def test_select (df ):
121
137
df_1 = df .select (
122
138
column ("a" ) + column ("b" ),
@@ -1642,8 +1658,112 @@ def test_html_formatter_manual_format_html(clean_formatter_state):
1642
1658
local_formatter = DataFrameHtmlFormatter (use_shared_styles = False )
1643
1659
1644
1660
# Both calls should include styles
1661
+
1645
1662
local_html_1 = local_formatter .format_html ([batch ], batch .schema )
1646
1663
local_html_2 = local_formatter .format_html ([batch ], batch .schema )
1647
1664
1648
1665
assert "<style>" in local_html_1
1649
1666
assert "<style>" in local_html_2
1667
+
1668
+
1669
def test_fill_null_basic(null_df):
    """Fill every null with the scalar 0 and check each column's contents."""
    batch = null_df.fill_null(0).collect()[0]

    # Expected contents per column position after filling with 0.
    expected_columns = [
        [1, 0, 3, 0],  # int_col: nulls become 0
        [4.5, 6.7, 0.0, 0.0],  # float_col: nulls become 0.0
        ["a", "0", "c", "0"],  # str_col: 0 is rendered as the string "0"
        [True, False, False, False],  # bool_col: 0 is converted to False
    ]
    for position, expected in enumerate(expected_columns):
        assert batch.column(position).to_pylist() == expected
1683
+
1684
+
1685
def test_fill_null_subset(null_df):
    """Fill nulls only in the numeric columns and leave the rest untouched.

    The columns named in ``subset`` must have their nulls replaced by 0,
    while the string and boolean columns must come back exactly as the
    ``null_df`` fixture created them.
    """
    filled_df = null_df.fill_null(0, subset=["int_col", "float_col"])

    result = filled_df.collect()[0]

    # Columns listed in the subset have their nulls replaced.
    assert result.column(0).to_pylist() == [1, 0, 3, 0]
    assert result.column(1).to_pylist() == [4.5, 6.7, 0.0, 0.0]

    # Columns outside the subset must be completely unchanged. Comparing the
    # full contents (rather than merely checking that some None remains)
    # also catches a partial fill or a reordering of values.
    assert result.column(2).to_pylist() == ["a", None, "c", None]
    assert result.column(3).to_pylist() == [True, None, False, None]
1698
+
1699
+
1700
def test_fill_null_specific_types(null_df):
    """Fill every column with the string "missing" and check each column type.

    A single string fill value is applied to all four columns; the expected
    outcome differs per column type.
    """
    batch = null_df.fill_null("missing").collect()[0]

    # NOTE(review): the numeric and boolean expectations assume that a fill
    # value which cannot be represented in the column type falls back to
    # 0 / 0.0 / False -- confirm against fill_null's documented behavior.
    expected_columns = [
        [1, 0, 3, 0],  # int_col
        [4.5, 6.7, 0.0, 0.0],  # float_col
        ["a", "missing", "c", "missing"],  # str_col takes the value verbatim
        [True, False, False, False],  # bool_col
    ]
    for position, expected in enumerate(expected_columns):
        assert batch.column(position).to_pylist() == expected
1712
+
1713
+
1714
def test_fill_null_immutability(null_df):
    """Check that calling fill_null leaves the source DataFrame unchanged.

    The contents of every column are captured before and after the call, so
    a change to any column (not only the integer one) is detected.
    """
    before = null_df.collect()[0]

    # Guard: the check is only meaningful if there were nulls to begin with.
    assert before.column(0).null_count > 0

    # The returned DataFrame is deliberately discarded; only the effect of
    # the call on ``null_df`` itself is under test.
    null_df.fill_null(0)

    after = null_df.collect()[0]

    # Same column names and same values, nulls included, in every column.
    assert after.to_pydict() == before.to_pydict()
1729
+
1730
+
1731
def test_fill_null_empty_df(ctx):
    """Run fill_null on a zero-row DataFrame and check the result's shape.

    The result must still have zero rows and keep the column names "a", "b".
    """
    column_names = ["a", "b"]
    empty_arrays = [
        pa.array([], type=pa.int64()),
        pa.array([], type=pa.string()),
    ]
    empty_batch = pa.RecordBatch.from_arrays(empty_arrays, names=column_names)
    empty_df = ctx.create_dataframe([[empty_batch]])

    # Filling must not raise even though there is nothing to fill.
    batch = empty_df.fill_null(0).collect()[0]

    # Still no rows in either column ...
    for position in range(len(column_names)):
        assert len(batch.column(position)) == 0
    # ... and the column names are preserved in order.
    for position, expected_name in enumerate(column_names):
        assert batch.schema.field(position).name == expected_name
1749
+
1750
+
1751
def test_fill_null_all_null_column(ctx):
    """Fill a string column that consists solely of nulls.

    Every row of column "b" must take the fill value, and re-collecting the
    source DataFrame must still show three nulls in that column.
    """
    non_null_values = pa.array([1, 2, 3])
    only_nulls = pa.array([None, None, None], type=pa.string())
    source_batch = pa.RecordBatch.from_arrays(
        [non_null_values, only_nulls], names=["a", "b"]
    )
    all_null_df = ctx.create_dataframe([[source_batch]])

    # Every null in the all-null column is replaced by the fill value.
    filled_batch = all_null_df.fill_null("filled").collect()[0]
    assert filled_batch.column(1).to_pylist() == ["filled", "filled", "filled"]

    # The DataFrame that fill_null was called on keeps its nulls.
    source_again = all_null_df.collect()[0]
    assert source_again.column(1).null_count == 3
0 commit comments