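Fix fill_null test cases (correct asserted column, cast null literals to typed columns), unpack expressions in select(), and strip trailing whitespace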

kosiew · kosiew · commit 4cf74963a637 · 2025-02-12T15:57:25.000+08:00
diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py
@@ -856,26 +856,26 @@ def within_limit(df: DataFrame, limit: int) -> DataFrame:
 
     def fill_null(self, value: Any, subset: list[str] | None = None) -> "DataFrame":
         """Fill null values in specified columns with a value.
-        
+
         Args:
             value: Value to replace nulls with. Will be cast to match column type.
             subset: Optional list of column names to fill. If None, fills all columns.
-        
+
         Returns:
             DataFrame with null values replaced where type casting is possible
-        
+
         Examples:
-            >>> df = df.fill_null(0)  # Fill all nulls with 0 where possible 
+            >>> df = df.fill_null(0)  # Fill all nulls with 0 where possible
             >>> df = df.fill_null("missing", subset=["name", "category"])  # Fill string columns
-            
+
         Notes:
             - Only fills nulls in columns where the value can be cast to the column type
             - For columns where casting fails, the original column is kept unchanged
             - For columns not in subset, the original column is kept unchanged
         """
         import pyarrow as pa
         from datafusion import functions as f
-        
+
         # Get columns to process
         if subset is None:
             subset = self.schema().names
@@ -891,22 +891,21 @@ def fill_null(self, value: Any, subset: list[str] | None = None) -> "DataFrame":
             if col_name in subset:
                 # Get column type
                 col_type = self.schema().field(col_name).type
-                
+
                 try:
                     # Try casting value to column type
                     typed_value = pa.scalar(value, type=col_type)
                     literal_expr = f.Expr.literal(typed_value)
-                    
+
                     # Build coalesce expression
                     expr = f.coalesce(f.col(col_name), literal_expr)
                     exprs.append(expr.alias(col_name))
-                
+
                 except (pa.ArrowTypeError, pa.ArrowInvalid):
                     # If cast fails, keep original column
                     exprs.append(f.col(col_name))
             else:
                 # Keep columns not in subset unchanged
                 exprs.append(f.col(col_name))
-                
-        # Return new DataFrame with filled values
-        return self.select(exprs)
+
+        return self.select(*exprs)
diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py
@@ -1198,24 +1198,23 @@ def test_dataframe_repr_html(df) -> None:
     assert output.replace(" ", "") == ref_html.replace(" ", "")
 
 
-    
 def test_fill_null(df):
     # Test filling nulls with integer value
-    df_with_nulls = df.with_column("d", literal(None).cast(pa.int64()))    
+    df_with_nulls = df.with_column("d", literal(None).cast(pa.int64()))
     df_filled = df_with_nulls.fill_null(0)
     result = df_filled.to_pydict()
     assert result["d"] == [0, 0, 0]
 
     # Test filling nulls with string value
-    df_with_nulls = df.with_column("d", literal(None).cast(pa.string()))   
+    df_with_nulls = df.with_column("d", literal(None).cast(pa.string()))
     df_filled = df_with_nulls.fill_null("missing")
     result = df_filled.to_pydict()
-    assert result["e"] == ["missing", "missing", "missing"]
+    assert result["d"] == ["missing", "missing", "missing"]
 
     # Test filling nulls with subset of columns
     df_with_nulls = df.with_columns(
-        literal(None).alias("d"),
-        literal(None).alias("e"),
+        literal(None).cast(pa.int64()).alias("d"),
+        literal(None).cast(pa.string()).alias("e"),
     )
     df_filled = df_with_nulls.fill_null("missing", subset=["e"])
     result = df_filled.to_pydict()
@@ -1230,8 +1229,8 @@ def test_fill_null(df):
 
     # Test filling nulls with value that can be cast to some columns but not others
     df_with_nulls = df.with_columns(
-        literal(None).alias("d"),
-        literal(None).alias("e"),
+        literal(None).alias("d").cast(pa.int64()),
+        literal(None).alias("e").cast(pa.string()),
     )
     df_filled = df_with_nulls.fill_null(0)
     result = df_filled.to_pydict()
@@ -1240,8 +1239,8 @@ def test_fill_null(df):
 
     # Test filling nulls with subset of columns where some casts fail
     df_with_nulls = df.with_columns(
-        literal(None).alias("d"),
-        literal(None).alias("e"),
+        literal(None).alias("d").cast(pa.int64()),
+        literal(None).alias("e").cast(pa.string()),
     )
     df_filled = df_with_nulls.fill_null(0, subset=["d", "e"])
     result = df_filled.to_pydict()
@@ -1250,8 +1249,8 @@ def test_fill_null(df):
 
     # Test filling nulls with subset of columns where all casts succeed
     df_with_nulls = df.with_columns(
-        literal(None).alias("d"),
-        literal(None).alias("e"),
+        literal(None).alias("d").cast(pa.int64()),
+        literal(None).alias("e").cast(pa.string()),
     )
     df_filled = df_with_nulls.fill_null("missing", subset=["e"])
     result = df_filled.to_pydict()
@@ -1260,8 +1259,8 @@ def test_fill_null(df):
 
     # Test filling nulls with subset of columns where some columns do not exist
     df_with_nulls = df.with_columns(
-        literal(None).alias("d"),
-        literal(None).alias("e"),
+        literal(None).alias("d").cast(pa.int64()),
+        literal(None).alias("e").cast(pa.string()),
     )
     with pytest.raises(ValueError, match="Column 'f' not found in DataFrame"):
-        df_with_nulls.fill_null("missing", subset=["e", "f"])
+        df_with_nulls.fill_null("missing", subset=["e", "f"])
diff --git a/python/tests/test_functions.py b/python/tests/test_functions.py
@@ -1174,16 +1174,19 @@ def test_between_default(df):
     actual = df.collect()[0].to_pydict()
     assert actual == expected
 
+
 def test_coalesce(df):
     # Create a DataFrame with null values
     ctx = SessionContext()
     batch = pa.RecordBatch.from_arrays(
         [
             pa.array(["Hello", None, "!"]),  # string column with null
-            pa.array([4, None, 6]),          # integer column with null
-            pa.array(["hello ", None, " !"]), # string column with null
-            pa.array([datetime(2022, 12, 31), None, datetime(2020, 7, 2)]),  # datetime with null
-            pa.array([False, None, True]),    # boolean column with null
+            pa.array([4, None, 6]),  # integer column with null
+            pa.array(["hello ", None, " !"]),  # string column with null
+            pa.array(
+                [datetime(2022, 12, 31), None, datetime(2020, 7, 2)]
+            ),  # datetime with null
+            pa.array([False, None, True]),  # boolean column with null
         ],
         names=["a", "b", "c", "d", "e"],
     )
@@ -1197,13 +1200,17 @@ def test_coalesce(df):
         f.coalesce(column("d"), literal(datetime(2000, 1, 1))).alias("d_coalesced"),
         f.coalesce(column("e"), literal(False)).alias("e_coalesced"),
     )
-    
+
     result = result_df.collect()[0]
 
     # Verify results
-    assert result.column(0) == pa.array(["Hello", "default", "!"], type=pa.string_view())
+    assert result.column(0) == pa.array(
+        ["Hello", "default", "!"], type=pa.string_view()
+    )
     assert result.column(1) == pa.array([4, 0, 6], type=pa.int64())
-    assert result.column(2) == pa.array(["hello ", "default", " !"], type=pa.string_view())
+    assert result.column(2) == pa.array(
+        ["hello ", "default", " !"], type=pa.string_view()
+    )
     assert result.column(3) == pa.array(
         [datetime(2022, 12, 31), datetime(2000, 1, 1), datetime(2020, 7, 2)],
         type=pa.timestamp("us"),
@@ -1212,7 +1219,11 @@ def test_coalesce(df):
 
     # Test multiple arguments
     result_df = df_with_nulls.select(
-        f.coalesce(column("a"), literal(None), literal("fallback")).alias("multi_coalesce")
+        f.coalesce(column("a"), literal(None), literal("fallback")).alias(
+            "multi_coalesce"
+        )
     )
     result = result_df.collect()[0]
-    assert result.column(0) == pa.array(["Hello", "fallback", "!"], type=pa.string_view())
+    assert result.column(0) == pa.array(
+        ["Hello", "fallback", "!"], type=pa.string_view()
+    )