Skip to content

Commit ddc918d

Browse files
timsaucer and claude committed
Improve docstrings and test robustness for new DataFrame methods
Clarify except_distinct/intersect_distinct docstrings, add deterministic sort to test_window, add sort_by ascending verification test, and add smoke tests for PGJSON and GRAPHVIZ explain formats.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent e10358f commit ddc918d

File tree

2 files changed

+30
-7
lines changed

2 files changed

+30
-7
lines changed

python/datafusion/dataframe.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1082,8 +1082,8 @@ def except_distinct(self, other: DataFrame) -> DataFrame:
10821082
"""Calculate the set difference with deduplication.
10831083
10841084
Returns rows that are in this DataFrame but not in ``other``,
1085-
removing any duplicates. This is the complement of :py:meth:`except_all`
1086-
which preserves duplicates.
1085+
removing any duplicates. In contrast, :py:meth:`except_all` preserves
1086+
duplicate rows.
10871087
10881088
The two :py:class:`DataFrame` must have exactly the same schema.
10891089
@@ -1098,8 +1098,8 @@ def except_distinct(self, other: DataFrame) -> DataFrame:
10981098
def intersect_distinct(self, other: DataFrame) -> DataFrame:
10991099
"""Calculate the intersection with deduplication.
11001100
1101-
Returns distinct rows that appear in both DataFrames. This is the
1102-
complement of :py:meth:`intersect` which preserves duplicates.
1101+
Returns distinct rows that appear in both DataFrames. In contrast,
1102+
:py:meth:`intersect` preserves duplicate rows.
11031103
11041104
The two :py:class:`DataFrame` must have exactly the same schema.
11051105

python/tests/test_dataframe.py

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3631,6 +3631,15 @@ def test_sort_by():
36313631
ctx = SessionContext()
36323632
df = ctx.from_pydict({"a": [3, 1, 2]})
36333633
result = df.sort_by(column("a")).collect()[0]
3634+
# sort_by always sorts ascending with nulls last
3635+
assert result.column(0).to_pylist() == [1, 2, 3]
3636+
3637+
3638+
def test_sort_by_is_always_ascending():
3639+
"""Verify sort_by uses ascending order regardless of input order."""
3640+
ctx = SessionContext()
3641+
df = ctx.from_pydict({"a": [1, 2, 3]})
3642+
result = df.sort_by(column("a")).collect()[0]
36343643
assert result.column(0).to_pylist() == [1, 2, 3]
36353644

36363645

@@ -3655,13 +3664,27 @@ def test_explain_with_format(capsys):
36553664
captured = capsys.readouterr()
36563665
assert "plan_type" in captured.out
36573666

3667+
# Smoke test: PGJSON format runs and prints the plan table (JSON validity is not checked)
3668+
df.explain(format=ExplainFormat.PGJSON)
3669+
captured = capsys.readouterr()
3670+
assert "plan_type" in captured.out
3671+
3672+
# Smoke test: Graphviz format runs and prints the plan table (DOT content is not checked)
3673+
df.explain(format=ExplainFormat.GRAPHVIZ)
3674+
captured = capsys.readouterr()
3675+
assert "plan_type" in captured.out
3676+
36583677

36593678
def test_window():
36603679
ctx = SessionContext()
36613680
df = ctx.from_pydict({"a": [1, 2, 3], "b": ["x", "x", "y"]})
3662-
result = df.window(
3663-
f.row_number(partition_by=[column("b")], order_by=[column("a")]).alias("rn")
3664-
).collect()[0]
3681+
result = (
3682+
df.window(
3683+
f.row_number(partition_by=[column("b")], order_by=[column("a")]).alias("rn")
3684+
)
3685+
.sort(column("a").sort(ascending=True))
3686+
.collect()[0]
3687+
)
36653688
assert "rn" in result.schema.names
36663689
assert result.column(result.schema.get_field_index("rn")).to_pylist() == [1, 2, 1]
36673690

0 commit comments

Comments
 (0)