feat: add fill_null method to DataFrame for handling null values

kosiew · kosiew · commit 106555e9d870 · 2025-02-12T15:01:08.000+08:00
diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py
@@ -38,13 +38,13 @@
 from datafusion.plan import ExecutionPlan, LogicalPlan
 from datafusion.record_batch import RecordBatchStream
 
+
 if TYPE_CHECKING:
     import pathlib
     from typing import Callable, Sequence
 
     import pandas as pd
     import polars as pl
-    import pyarrow as pa
 
 from enum import Enum
 
@@ -853,3 +853,60 @@ def within_limit(df: DataFrame, limit: int) -> DataFrame:
             DataFrame: After applying func to the original dataframe.
         """
         return func(self, *args)
+
+    def fill_null(self, value: Any, subset: list[str] | None = None) -> "DataFrame":
+        """Fill null values in the selected columns with ``value``.
+
+        Each selected column becomes ``coalesce(column, value)``, with ``value``
+        first converted to a PyArrow scalar of that column's type.
+
+        Args:
+            value: Replacement for nulls; converted to each column's type.
+            subset: Column names to fill. If None, every column is considered.
+
+        Returns:
+            DataFrame with the same columns in the same order. Columns outside
+            ``subset``, and columns whose type cannot hold ``value``, are
+            passed through unchanged.
+
+        Raises:
+            ValueError: If ``subset`` names a column that is not in the schema.
+
+        Examples:
+            >>> df = df.fill_null(0)  # fills the columns that can hold 0
+            >>> df = df.fill_null("missing", subset=["name", "category"])
+        """
+        import pyarrow as pa
+        from datafusion import functions as f
+
+        # Fetch the schema once instead of once (or more) per column.
+        schema = self.schema()
+        names = schema.names
+        if subset is None:
+            targets = set(names)
+        else:
+            for col in subset:
+                if col not in names:
+                    raise ValueError(f"Column '{col}' not found in DataFrame")
+            targets = set(subset)
+
+        # A failed conversion means "leave this column alone", not an error.
+        # OverflowError is included for Python ints too large for the column
+        # type, so that case is skipped like any other failed conversion.
+        cast_errors = (pa.ArrowTypeError, pa.ArrowInvalid, OverflowError)
+
+        exprs = []
+        for name in names:
+            expr = f.col(name)
+            if name in targets:
+                try:
+                    typed_value = pa.scalar(value, type=schema.field(name).type)
+                except cast_errors:
+                    pass  # keep the plain column reference
+                else:
+                    fill = f.Expr.literal(typed_value)
+                    expr = f.coalesce(expr, fill).alias(name)
+            exprs.append(expr)
+
+        # Unpack so each expression reaches select() as its own argument.
+        return self.select(*exprs)
diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py
@@ -1196,3 +1196,72 @@ def test_dataframe_repr_html(df) -> None:
 
     # Ignore whitespace just to make this test look cleaner
     assert output.replace(" ", "") == ref_html.replace(" ", "")
+
+
+    
+def test_fill_null(df) -> None:
+    # Integer fill lands in a nullable int64 column.
+    df_with_nulls = df.with_column("d", literal(None).cast(pa.int64()))
+    df_filled = df_with_nulls.fill_null(0)
+    result = df_filled.to_pydict()
+    assert result["d"] == [0, 0, 0]
+
+    # String fill lands in a nullable string column.
+    df_with_nulls = df.with_column("e", literal(None).cast(pa.string()))
+    df_filled = df_with_nulls.fill_null("missing")
+    result = df_filled.to_pydict()
+    assert result["e"] == ["missing", "missing", "missing"]
+
+    # Only the columns named in subset are filled.
+    df_with_nulls = df.with_columns(
+        literal(None).cast(pa.string()).alias("d"),
+        literal(None).cast(pa.string()).alias("e"),
+    )
+    df_filled = df_with_nulls.fill_null("missing", subset=["e"])
+    result = df_filled.to_pydict()
+    assert result["d"] == [None, None, None]
+    assert result["e"] == ["missing", "missing", "missing"]
+
+    # A value that cannot be converted to the column type leaves it untouched.
+    df_with_nulls = df.with_column("d", literal(None).cast(pa.int64()))
+    df_filled = df_with_nulls.fill_null("invalid")
+    result = df_filled.to_pydict()
+    assert result["d"] == [None, None, None]
+
+    # Mixed types: 0 fits the int64 column but not the string column.
+    df_with_nulls = df.with_columns(
+        literal(None).cast(pa.int64()).alias("d"),
+        literal(None).cast(pa.string()).alias("e"),
+    )
+    df_filled = df_with_nulls.fill_null(0)
+    result = df_filled.to_pydict()
+    assert result["d"] == [0, 0, 0]
+    assert result["e"] == [None, None, None]
+
+    # Same mixed-type frame, with both columns named explicitly in subset.
+    df_with_nulls = df.with_columns(
+        literal(None).cast(pa.int64()).alias("d"),
+        literal(None).cast(pa.string()).alias("e"),
+    )
+    df_filled = df_with_nulls.fill_null(0, subset=["d", "e"])
+    result = df_filled.to_pydict()
+    assert result["d"] == [0, 0, 0]
+    assert result["e"] == [None, None, None]
+
+    # Subset where every conversion succeeds: both string columns are filled.
+    df_with_nulls = df.with_columns(
+        literal(None).cast(pa.string()).alias("d"),
+        literal(None).cast(pa.string()).alias("e"),
+    )
+    df_filled = df_with_nulls.fill_null("missing", subset=["d", "e"])
+    result = df_filled.to_pydict()
+    assert result["d"] == ["missing", "missing", "missing"]
+    assert result["e"] == ["missing", "missing", "missing"]
+
+    # Naming a column that does not exist is rejected up front.
+    df_with_nulls = df.with_columns(
+        literal(None).alias("d"),
+        literal(None).alias("e"),
+    )
+    with pytest.raises(ValueError, match="Column 'f' not found in DataFrame"):
+        df_with_nulls.fill_null("missing", subset=["e", "f"])