1717import os
1818import re
1919from typing import Any
20-
20+ import datetime
2121import pyarrow as pa
2222import pyarrow .parquet as pq
2323import pytest
@@ -128,7 +128,9 @@ def null_df():
128128 pa .array ([4.5 , 6.7 , None , None ], type = pa .float64 ()),
129129 pa .array (["a" , None , "c" , None ], type = pa .string ()),
130130 pa .array ([True , None , False , None ], type = pa .bool_ ()),
131- ], names = ["int_col" , "float_col" , "str_col" , "bool_col" ])
131+ pa .array ([10957 , None , 18993 , None ], type = pa .date32 ()), # 2000-01-01, null, 2022-01-01, null
132+ pa .array ([946684800000 , None , 1640995200000 , None ], type = pa .date64 ()), # 2000-01-01, null, 2022-01-01, null
133+ ], names = ["int_col" , "float_col" , "str_col" , "bool_col" , "date32_col" , "date64_col" ])
132134
133135 return ctx .create_dataframe ([[batch ]])
134136
@@ -1524,7 +1526,7 @@ def add_with_parameter(df_internal, value: Any) -> DataFrame:
15241526def test_dataframe_repr_html_structure (df ) -> None :
15251527 """Test that DataFrame._repr_html_ produces expected HTML output structure."""
15261528 import re
1527-
1529+
15281530 output = df ._repr_html_ ()
15291531
15301532 # Since we've added a fair bit of processing to the html output, lets just verify
@@ -1658,14 +1660,12 @@ def test_html_formatter_manual_format_html(clean_formatter_state):
16581660 local_formatter = DataFrameHtmlFormatter (use_shared_styles = False )
16591661
16601662 # Both calls should include styles
1661-
16621663 local_html_1 = local_formatter .format_html ([batch ], batch .schema )
16631664 local_html_2 = local_formatter .format_html ([batch ], batch .schema )
16641665
16651666 assert "<style>" in local_html_1
16661667 assert "<style>" in local_html_2
1667-
1668-
1668+
16691669def test_fill_null_basic (null_df ):
16701670 """Test basic fill_null functionality with a single value."""
16711671 # Fill all nulls with 0
@@ -1674,12 +1674,12 @@ def test_fill_null_basic(null_df):
16741674 result = filled_df .collect ()[0 ]
16751675
16761676 # Check that nulls were filled with 0 (or equivalent)
1677- assert result .column (0 ). to_pylist () == [1 , 0 , 3 , 0 ]
1678- assert result .column (1 ). to_pylist () == [4.5 , 6.7 , 0.0 , 0.0 ]
1677+ assert result .column (0 ) == pa . array ( [1 , 0 , 3 , 0 ])
1678+ assert result .column (1 ) == pa . array ( [4.5 , 6.7 , 0.0 , 0.0 ])
16791679 # String column should be filled with "0"
1680- assert result .column (2 ). to_pylist () == ["a" , "0" , "c" , "0" ]
1680+ assert result .column (2 ) == pa . array ( ["a" , "0" , "c" , "0" ])
16811681 # Boolean column should be filled with False (0 converted to bool)
1682- assert result .column (3 ). to_pylist () == [True , False , False , False ]
1682+ assert result .column (3 ) == pa . array ( [True , False , False , False ])
16831683
16841684
16851685def test_fill_null_subset (null_df ):
@@ -1690,11 +1690,131 @@ def test_fill_null_subset(null_df):
16901690 result = filled_df .collect ()[0 ]
16911691
16921692 # Check that nulls were filled only in specified columns
1693- assert result .column (0 ). to_pylist () == [1 , 0 , 3 , 0 ]
1694- assert result .column (1 ). to_pylist () == [4.5 , 6.7 , 0.0 , 0.0 ]
1693+ assert result .column (0 ) == pa . array ( [1 , 0 , 3 , 0 ])
1694+ assert result .column (1 ) == pa . array ( [4.5 , 6.7 , 0.0 , 0.0 ])
16951695 # These should still have nulls
16961696 assert None in result .column (2 ).to_pylist ()
16971697 assert None in result .column (3 ).to_pylist ()
1698+
1699+ def test_fill_null_str_column (null_df ):
1700+ """Check that fill_null restricted to str_col fills only the string nulls (N/A, then empty string)."""
1701+ # Restrict the fill to str_col so every other column keeps its nulls
1702+ filled_df = null_df .fill_null ("N/A" , subset = ["str_col" ])
1703+
1704+ result = filled_df .collect ()[0 ]
1705+
1706+ # Column 2 is str_col: rows 1 and 3 were null and must now hold "N/A"
1707+ assert result .column (2 ).to_pylist () == ["a" , "N/A" , "c" , "N/A" ]
1708+
1709+ # int, float and bool columns (0, 1, 3) must still contain nulls; date columns are not checked here
1710+ assert None in result .column (0 ).to_pylist ()
1711+ assert None in result .column (1 ).to_pylist ()
1712+ assert None in result .column (3 ).to_pylist ()
1713+
1714+ # An empty string is accepted as a fill value and must appear verbatim in the result
1715+ filled_df = null_df .fill_null ("" , subset = ["str_col" ])
1716+ result = filled_df .collect ()[0 ]
1717+ assert result .column (2 ).to_pylist () == ["a" , "" , "c" , "" ]
1718+
1719+
1720+ def test_fill_null_bool_column (null_df ):
1721+ """Check fill_null restricted to bool_col, once with True and once with False."""
1722+ # Restrict the fill to bool_col; True differs from the existing False in row 2
1723+ filled_df = null_df .fill_null (True , subset = ["bool_col" ])
1724+
1725+ result = filled_df .collect ()[0 ]
1726+
1727+ # Column 3 is bool_col: rows 1 and 3 were null and must now be True
1728+ assert result .column (3 ).to_pylist () == [True , True , False , True ]
1729+
1730+ # Only int_col (column 0) is spot-checked for remaining nulls
1731+ assert None in result .column (0 ).to_pylist ()
1732+
1733+ # Fill again from null_df with False; the expectation relies on null_df still holding its nulls
1734+ filled_df = null_df .fill_null (False , subset = ["bool_col" ])
1735+ result = filled_df .collect ()[0 ]
1736+ assert result .column (3 ).to_pylist () == [True , False , False , False ]
1737+
1738+
1739+ def test_fill_null_date32_column (null_df ):
1740+ """Check that fill_null fills date32_col nulls with a datetime.date and leaves date64_col alone."""
1741+
1742+ # Use 1970-01-01 as the fill value; it differs from both dates already in the column
1743+ epoch_date = datetime .date (1970 , 1 , 1 )
1744+ filled_df = null_df .fill_null (epoch_date , subset = ["date32_col" ])
1745+
1746+ result = filled_df .collect ()[0 ]
1747+
1748+ # Column 4 is date32_col; values are compared against datetime.date objects
1749+ dates = result .column (4 ).to_pylist ()
1750+ assert dates [0 ] == datetime .date (2000 , 1 , 1 ) # Original value
1751+ assert dates [1 ] == epoch_date # Filled value
1752+ assert dates [2 ] == datetime .date (2022 , 1 , 1 ) # Original value
1753+ assert dates [3 ] == epoch_date # Filled value
1754+
1755+ # date64_col (column 5) was not in the subset, so it must still contain nulls
1756+ assert None in result .column (5 ).to_pylist ()
1757+
1758+
1759+ def test_fill_null_date64_column (null_df ):
1760+ """Check that fill_null fills date64_col nulls with a datetime.date and leaves date32_col alone."""
1761+
1762+ # Use 1970-01-01 as the fill value; it differs from both dates already in the column
1763+ epoch_date = datetime .date (1970 , 1 , 1 )
1764+ filled_df = null_df .fill_null (epoch_date , subset = ["date64_col" ])
1765+
1766+ result = filled_df .collect ()[0 ]
1767+
1768+ # Column 5 is date64_col; values are compared against datetime.date objects
1769+ dates = result .column (5 ).to_pylist ()
1770+ assert dates [0 ] == datetime .date (2000 , 1 , 1 ) # Original value
1771+ assert dates [1 ] == epoch_date # Filled value
1772+ assert dates [2 ] == datetime .date (2022 , 1 , 1 ) # Original value
1773+ assert dates [3 ] == epoch_date # Filled value
1774+
1775+ # date32_col (column 4) was not in the subset, so it must still contain nulls
1776+ assert None in result .column (4 ).to_pylist ()
1777+
1778+
1779+ def test_fill_null_type_coercion (null_df ):
1780+ """Check fill_null when the fill value's Python type differs from the column's type."""
1781+ # Fill str_col with an int; the expectation below is the int rendered as a string
1782+ filled_df = null_df .fill_null (42 , subset = ["str_col" ])
1783+
1784+ result = filled_df .collect ()[0 ]
1785+
1786+ # String nulls should be filled with the string representation of the number
1787+ assert result .column (2 ).to_pylist () == ["a" , "42" , "c" , "42" ]
1788+
1789+ # Fill bool_col with the string "true"; the resulting boolean value is not asserted here
1790+ filled_df = null_df .fill_null ("true" , subset = ["bool_col" ])
1791+ result = filled_df .collect ()[0 ]
1792+
1793+ # The exact conversion is implementation-dependent, so only require that the call
1794+ # succeeds and that no nulls remain in bool_col (column 3)
1795+ assert None not in result .column (3 ).to_pylist ()
1796+
1797+
1798+ def test_fill_null_multiple_date_columns (null_df ):
1799+ """Check that one fill_null call with a date value fills both date32_col and date64_col."""
1800+
1801+ # A single datetime.date is applied to both columns named in subset
1802+ test_date = datetime .date (2023 , 12 , 31 )
1803+ filled_df = null_df .fill_null (test_date , subset = ["date32_col" , "date64_col" ])
1804+
1805+ result = filled_df .collect ()[0 ]
1806+
1807+ # Columns 4 and 5 are date32_col and date64_col; rows 1 and 3 were null in both
1808+ date32_vals = result .column (4 ).to_pylist ()
1809+ date64_vals = result .column (5 ).to_pylist ()
1810+
1811+ assert None not in date32_vals
1812+ assert None not in date64_vals
1813+
1814+ assert date32_vals [1 ] == test_date
1815+ assert date32_vals [3 ] == test_date
1816+ assert date64_vals [1 ] == test_date
1817+ assert date64_vals [3 ] == test_date
16981818
16991819
17001820def test_fill_null_specific_types (null_df ):
@@ -1705,10 +1825,13 @@ def test_fill_null_specific_types(null_df):
17051825 result = filled_df .collect ()[0 ]
17061826
17071827 # Check that nulls were filled appropriately by type
1708- assert result .column (0 ).to_pylist () == [1 , 0 , 3 , 0 ] # Int gets 0 from "missing" conversion
1709- assert result .column (1 ).to_pylist () == [4.5 , 6.7 , 0.0 , 0.0 ] # Float gets 0.0
1710- assert result .column (2 ).to_pylist () == ["a" , "missing" , "c" , "missing" ] # String gets "missing"
1711- assert result .column (3 ).to_pylist () == [True , False , False , False ] # Bool gets False
1828+
1829+ assert result .column (0 ).to_pylist () == [1 , None , 3 , None ]
1830+ assert result .column (1 ).to_pylist () == [4.5 , 6.7 , None , None ]
1831+ assert result .column (2 ).to_pylist () == ["a" , "missing" , "c" , "missing" ]
1832+ assert result .column (3 ).to_pylist () == [True , None , False , None ] # Bool nulls are left as null
1833+ assert result .column (4 ).to_pylist () == [datetime .date (2000 , 1 , 1 ), None , datetime .date (2022 , 1 , 1 ), None ]
1834+ assert result .column (5 ).to_pylist () == [datetime .date (2000 , 1 , 1 ), None , datetime .date (2022 , 1 , 1 ), None ]
17121835
17131836
17141837def test_fill_null_immutability (null_df ):
@@ -1763,7 +1886,3 @@ def test_fill_null_all_null_column(ctx):
17631886 # Check that all nulls were filled
17641887 result = filled_df .collect ()[0 ]
17651888 assert result .column (1 ).to_pylist () == ["filled" , "filled" , "filled" ]
1766-
1767- # Original should be unchanged
1768- original = all_null_df .collect ()[0 ]
1769- assert original .column (1 ).null_count == 3
0 commit comments