Skip to content

Commit 4cf7496

Browse files
committed
Resolve test cases for fill_null
1 parent cff9b7c commit 4cf7496

File tree

3 files changed

+45
-36
lines changed

3 files changed

+45
-36
lines changed

python/datafusion/dataframe.py

Lines changed: 11 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -856,26 +856,26 @@ def within_limit(df: DataFrame, limit: int) -> DataFrame:
856856

857857
def fill_null(self, value: Any, subset: list[str] | None = None) -> "DataFrame":
858858
"""Fill null values in specified columns with a value.
859-
859+
860860
Args:
861861
value: Value to replace nulls with. Will be cast to match column type.
862862
subset: Optional list of column names to fill. If None, fills all columns.
863-
863+
864864
Returns:
865865
DataFrame with null values replaced where type casting is possible
866-
866+
867867
Examples:
868-
>>> df = df.fill_null(0) # Fill all nulls with 0 where possible
868+
>>> df = df.fill_null(0) # Fill all nulls with 0 where possible
869869
>>> df = df.fill_null("missing", subset=["name", "category"]) # Fill string columns
870-
870+
871871
Notes:
872872
- Only fills nulls in columns where the value can be cast to the column type
873873
- For columns where casting fails, the original column is kept unchanged
874874
- For columns not in subset, the original column is kept unchanged
875875
"""
876876
import pyarrow as pa
877877
from datafusion import functions as f
878-
878+
879879
# Get columns to process
880880
if subset is None:
881881
subset = self.schema().names
@@ -891,22 +891,21 @@ def fill_null(self, value: Any, subset: list[str] | None = None) -> "DataFrame":
891891
if col_name in subset:
892892
# Get column type
893893
col_type = self.schema().field(col_name).type
894-
894+
895895
try:
896896
# Try casting value to column type
897897
typed_value = pa.scalar(value, type=col_type)
898898
literal_expr = f.Expr.literal(typed_value)
899-
899+
900900
# Build coalesce expression
901901
expr = f.coalesce(f.col(col_name), literal_expr)
902902
exprs.append(expr.alias(col_name))
903-
903+
904904
except (pa.ArrowTypeError, pa.ArrowInvalid):
905905
# If cast fails, keep original column
906906
exprs.append(f.col(col_name))
907907
else:
908908
# Keep columns not in subset unchanged
909909
exprs.append(f.col(col_name))
910-
911-
# Return new DataFrame with filled values
912-
return self.select(exprs)
910+
911+
return self.select(*exprs)

python/tests/test_dataframe.py

Lines changed: 14 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1198,24 +1198,23 @@ def test_dataframe_repr_html(df) -> None:
11981198
assert output.replace(" ", "") == ref_html.replace(" ", "")
11991199

12001200

1201-
12021201
def test_fill_null(df):
12031202
# Test filling nulls with integer value
1204-
df_with_nulls = df.with_column("d", literal(None).cast(pa.int64()))
1203+
df_with_nulls = df.with_column("d", literal(None).cast(pa.int64()))
12051204
df_filled = df_with_nulls.fill_null(0)
12061205
result = df_filled.to_pydict()
12071206
assert result["d"] == [0, 0, 0]
12081207

12091208
# Test filling nulls with string value
1210-
df_with_nulls = df.with_column("d", literal(None).cast(pa.string()))
1209+
df_with_nulls = df.with_column("d", literal(None).cast(pa.string()))
12111210
df_filled = df_with_nulls.fill_null("missing")
12121211
result = df_filled.to_pydict()
1213-
assert result["e"] == ["missing", "missing", "missing"]
1212+
assert result["d"] == ["missing", "missing", "missing"]
12141213

12151214
# Test filling nulls with subset of columns
12161215
df_with_nulls = df.with_columns(
1217-
literal(None).alias("d"),
1218-
literal(None).alias("e"),
1216+
literal(None).cast(pa.int64()).alias("d"),
1217+
literal(None).cast(pa.string()).alias("e"),
12191218
)
12201219
df_filled = df_with_nulls.fill_null("missing", subset=["e"])
12211220
result = df_filled.to_pydict()
@@ -1230,8 +1229,8 @@ def test_fill_null(df):
12301229

12311230
# Test filling nulls with value that can be cast to some columns but not others
12321231
df_with_nulls = df.with_columns(
1233-
literal(None).alias("d"),
1234-
literal(None).alias("e"),
1232+
literal(None).alias("d").cast(pa.int64()),
1233+
literal(None).alias("e").cast(pa.string()),
12351234
)
12361235
df_filled = df_with_nulls.fill_null(0)
12371236
result = df_filled.to_pydict()
@@ -1240,8 +1239,8 @@ def test_fill_null(df):
12401239

12411240
# Test filling nulls with subset of columns where some casts fail
12421241
df_with_nulls = df.with_columns(
1243-
literal(None).alias("d"),
1244-
literal(None).alias("e"),
1242+
literal(None).alias("d").cast(pa.int64()),
1243+
literal(None).alias("e").cast(pa.string()),
12451244
)
12461245
df_filled = df_with_nulls.fill_null(0, subset=["d", "e"])
12471246
result = df_filled.to_pydict()
@@ -1250,8 +1249,8 @@ def test_fill_null(df):
12501249

12511250
# Test filling nulls with subset of columns where all casts succeed
12521251
df_with_nulls = df.with_columns(
1253-
literal(None).alias("d"),
1254-
literal(None).alias("e"),
1252+
literal(None).alias("d").cast(pa.int64()),
1253+
literal(None).alias("e").cast(pa.string()),
12551254
)
12561255
df_filled = df_with_nulls.fill_null("missing", subset=["e"])
12571256
result = df_filled.to_pydict()
@@ -1260,8 +1259,8 @@ def test_fill_null(df):
12601259

12611260
# Test filling nulls with subset of columns where some columns do not exist
12621261
df_with_nulls = df.with_columns(
1263-
literal(None).alias("d"),
1264-
literal(None).alias("e"),
1262+
literal(None).alias("d").cast(pa.int64()),
1263+
literal(None).alias("e").cast(pa.string()),
12651264
)
12661265
with pytest.raises(ValueError, match="Column 'f' not found in DataFrame"):
1267-
df_with_nulls.fill_null("missing", subset=["e", "f"])
1266+
df_with_nulls.fill_null("missing", subset=["e", "f"])

python/tests/test_functions.py

Lines changed: 20 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1174,16 +1174,19 @@ def test_between_default(df):
11741174
actual = df.collect()[0].to_pydict()
11751175
assert actual == expected
11761176

1177+
11771178
def test_coalesce(df):
11781179
# Create a DataFrame with null values
11791180
ctx = SessionContext()
11801181
batch = pa.RecordBatch.from_arrays(
11811182
[
11821183
pa.array(["Hello", None, "!"]), # string column with null
1183-
pa.array([4, None, 6]), # integer column with null
1184-
pa.array(["hello ", None, " !"]), # string column with null
1185-
pa.array([datetime(2022, 12, 31), None, datetime(2020, 7, 2)]), # datetime with null
1186-
pa.array([False, None, True]), # boolean column with null
1184+
pa.array([4, None, 6]), # integer column with null
1185+
pa.array(["hello ", None, " !"]), # string column with null
1186+
pa.array(
1187+
[datetime(2022, 12, 31), None, datetime(2020, 7, 2)]
1188+
), # datetime with null
1189+
pa.array([False, None, True]), # boolean column with null
11871190
],
11881191
names=["a", "b", "c", "d", "e"],
11891192
)
@@ -1197,13 +1200,17 @@ def test_coalesce(df):
11971200
f.coalesce(column("d"), literal(datetime(2000, 1, 1))).alias("d_coalesced"),
11981201
f.coalesce(column("e"), literal(False)).alias("e_coalesced"),
11991202
)
1200-
1203+
12011204
result = result_df.collect()[0]
12021205

12031206
# Verify results
1204-
assert result.column(0) == pa.array(["Hello", "default", "!"], type=pa.string_view())
1207+
assert result.column(0) == pa.array(
1208+
["Hello", "default", "!"], type=pa.string_view()
1209+
)
12051210
assert result.column(1) == pa.array([4, 0, 6], type=pa.int64())
1206-
assert result.column(2) == pa.array(["hello ", "default", " !"], type=pa.string_view())
1211+
assert result.column(2) == pa.array(
1212+
["hello ", "default", " !"], type=pa.string_view()
1213+
)
12071214
assert result.column(3) == pa.array(
12081215
[datetime(2022, 12, 31), datetime(2000, 1, 1), datetime(2020, 7, 2)],
12091216
type=pa.timestamp("us"),
@@ -1212,7 +1219,11 @@ def test_coalesce(df):
12121219

12131220
# Test multiple arguments
12141221
result_df = df_with_nulls.select(
1215-
f.coalesce(column("a"), literal(None), literal("fallback")).alias("multi_coalesce")
1222+
f.coalesce(column("a"), literal(None), literal("fallback")).alias(
1223+
"multi_coalesce"
1224+
)
12161225
)
12171226
result = result_df.collect()[0]
1218-
assert result.column(0) == pa.array(["Hello", "fallback", "!"], type=pa.string_view())
1227+
assert result.column(0) == pa.array(
1228+
["Hello", "fallback", "!"], type=pa.string_view()
1229+
)

0 commit comments

Comments
 (0)