Skip to content

Commit 106555e

Browse files
committed
feat: add fill_null method to DataFrame for handling null values
1 parent d635d56 commit 106555e

File tree

2 files changed

+127
-1
lines changed

2 files changed

+127
-1
lines changed

python/datafusion/dataframe.py

Lines changed: 58 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,13 +38,13 @@
3838
from datafusion.plan import ExecutionPlan, LogicalPlan
3939
from datafusion.record_batch import RecordBatchStream
4040

41+
4142
if TYPE_CHECKING:
4243
import pathlib
4344
from typing import Callable, Sequence
4445

4546
import pandas as pd
4647
import polars as pl
47-
import pyarrow as pa
4848

4949
from enum import Enum
5050

@@ -853,3 +853,60 @@ def within_limit(df: DataFrame, limit: int) -> DataFrame:
853853
DataFrame: After applying func to the original dataframe.
854854
"""
855855
return func(self, *args)
856+
857+
def fill_null(self, value: Any, subset: list[str] | None = None) -> "DataFrame":
    """Fill null values in the selected columns with ``value``.

    For every selected column, ``value`` is converted to a PyArrow scalar of
    that column's type and the column is replaced by
    ``coalesce(column, literal)`` under its original name. Columns whose type
    cannot represent ``value`` are passed through unchanged instead of
    raising.

    Args:
        value: Value to replace nulls with. It is converted to each target
            column's type with ``pyarrow.scalar``.
        subset: Optional list of column names to fill. If ``None``, every
            column in the schema is a candidate.

    Returns:
        A new DataFrame with the same columns, in schema order, where nulls
        have been replaced in each selected column that could hold ``value``.

    Raises:
        ValueError: If ``subset`` names a column that is not in the schema.

    Examples:
        >>> df = df.fill_null(0)  # Fill all nulls with 0 where possible
        >>> df = df.fill_null("missing", subset=["name", "category"])

    Notes:
        - Only columns for which the conversion of ``value`` succeeds are
          filled.
        - Columns where the conversion fails are kept unchanged.
        - Columns not listed in ``subset`` are kept unchanged.
    """
    import pyarrow as pa

    from datafusion import functions as f
    from datafusion.expr import Expr

    # Read the schema once; it is needed for validation, ordering and types.
    schema = self.schema()
    column_names = schema.names

    if subset is None:
        targets = set(column_names)
    else:
        known = set(column_names)
        for col in subset:
            if col not in known:
                raise ValueError(f"Column '{col}' not found in DataFrame")
        targets = set(subset)

    # One output expression per schema column, preserving column order.
    exprs = []
    for col_name in column_names:
        expr = f.col(col_name)
        if col_name in targets:
            # Keep the try body to the conversion alone so that errors from
            # expression building are not mistaken for a failed cast.
            try:
                typed_value = pa.scalar(value, type=schema.field(col_name).type)
            except (
                pa.ArrowTypeError,
                pa.ArrowInvalid,
                pa.ArrowNotImplementedError,
            ):
                # Value does not fit this column's type: leave it untouched.
                pass
            else:
                expr = f.coalesce(expr, Expr.literal(typed_value)).alias(col_name)
        exprs.append(expr)

    # NOTE(review): select is documented as variadic (*exprs), so the list is
    # unpacked rather than passed as a single argument - confirm against the
    # select definition in this file.
    return self.select(*exprs)

python/tests/test_dataframe.py

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1196,3 +1196,72 @@ def test_dataframe_repr_html(df) -> None:
11961196

11971197
# Ignore whitespace just to make this test look cleaner
11981198
assert output.replace(" ", "") == ref_html.replace(" ", "")
1199+
1200+
1201+
1202+
def test_fill_null(df) -> None:
    """Exercise DataFrame.fill_null across value types and column subsets.

    Null columns are given explicit Arrow types. fill_null converts the fill
    value to each column's type, so a column can only be filled when its type
    can represent the value; an untyped ``literal(None)`` column is presumably
    null-typed and is therefore only used for the "cannot be cast" case.
    """

    def typed_nulls(d_type, e_type):
        # Add an all-null column "d" of d_type and "e" of e_type to the fixture.
        return df.with_columns(
            literal(None).cast(d_type).alias("d"),
            literal(None).cast(e_type).alias("e"),
        )

    # Test filling nulls with integer value
    df_with_nulls = df.with_column("d", literal(None).cast(pa.int64()))
    df_filled = df_with_nulls.fill_null(0)
    result = df_filled.to_pydict()
    assert result["d"] == [0, 0, 0]

    # Test filling nulls with string value (the column under test must exist
    # and be string-typed for the fill to apply)
    df_with_nulls = df.with_column("e", literal(None).cast(pa.string()))
    df_filled = df_with_nulls.fill_null("missing")
    result = df_filled.to_pydict()
    assert result["e"] == ["missing", "missing", "missing"]

    # Test filling nulls with subset of columns
    df_with_nulls = typed_nulls(pa.int64(), pa.string())
    df_filled = df_with_nulls.fill_null("missing", subset=["e"])
    result = df_filled.to_pydict()
    assert result["d"] == [None, None, None]
    assert result["e"] == ["missing", "missing", "missing"]

    # Test filling nulls with value that cannot be cast to column type
    df_with_nulls = df.with_column("d", literal(None))
    df_filled = df_with_nulls.fill_null("invalid")
    result = df_filled.to_pydict()
    assert result["d"] == [None, None, None]

    # Test filling nulls with value that can be cast to some columns but not others
    df_with_nulls = typed_nulls(pa.int64(), pa.string())
    df_filled = df_with_nulls.fill_null(0)
    result = df_filled.to_pydict()
    assert result["d"] == [0, 0, 0]
    assert result["e"] == [None, None, None]

    # Test filling nulls with subset of columns where some casts fail
    df_with_nulls = typed_nulls(pa.int64(), pa.string())
    df_filled = df_with_nulls.fill_null(0, subset=["d", "e"])
    result = df_filled.to_pydict()
    assert result["d"] == [0, 0, 0]
    assert result["e"] == [None, None, None]

    # Test filling nulls with subset of columns where all casts succeed
    df_with_nulls = typed_nulls(pa.string(), pa.string())
    df_filled = df_with_nulls.fill_null("missing", subset=["d", "e"])
    result = df_filled.to_pydict()
    assert result["d"] == ["missing", "missing", "missing"]
    assert result["e"] == ["missing", "missing", "missing"]

    # Test filling nulls with subset of columns where some columns do not exist
    df_with_nulls = typed_nulls(pa.int64(), pa.string())
    with pytest.raises(ValueError, match="Column 'f' not found in DataFrame"):
        df_with_nulls.fill_null("missing", subset=["e", "f"])

0 commit comments

Comments
 (0)