Improve docstrings and test robustness for new DataFrame methods

timsaucer · claude · timsaucer · commit ddc918df0a56 · 2026-04-03T16:36:48.000-04:00
Clarify except_distinct/intersect_distinct docstrings, add deterministic
sort to test_window, add sort_by ascending verification test, and add
smoke tests for PGJSON and GRAPHVIZ explain formats.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py
@@ -1082,8 +1082,8 @@ def except_distinct(self, other: DataFrame) -> DataFrame:
         """Calculate the set difference with deduplication.
 
         Returns rows that are in this DataFrame but not in ``other``,
-        removing any duplicates. This is the complement of :py:meth:`except_all`
-        which preserves duplicates.
+        removing any duplicates. In contrast, :py:meth:`except_all` preserves
+        duplicate rows.
 
         The two :py:class:`DataFrame` must have exactly the same schema.
 
@@ -1098,8 +1098,8 @@ def except_distinct(self, other: DataFrame) -> DataFrame:
     def intersect_distinct(self, other: DataFrame) -> DataFrame:
         """Calculate the intersection with deduplication.
 
-        Returns distinct rows that appear in both DataFrames. This is the
-        complement of :py:meth:`intersect` which preserves duplicates.
+        Returns distinct rows that appear in both DataFrames. In contrast,
+        :py:meth:`intersect` preserves duplicate rows.
 
         The two :py:class:`DataFrame` must have exactly the same schema.
 
diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py
@@ -3631,6 +3631,15 @@ def test_sort_by():
     ctx = SessionContext()
     df = ctx.from_pydict({"a": [3, 1, 2]})
     result = df.sort_by(column("a")).collect()[0]
+    # sort_by always sorts ascending with nulls last
+    assert result.column(0).to_pylist() == [1, 2, 3]
+
+
+def test_sort_by_is_always_ascending():
+    """Verify sort_by keeps already-ascending input in ascending order."""
+    ctx = SessionContext()
+    df = ctx.from_pydict({"a": [1, 2, 3]})
+    result = df.sort_by(column("a")).collect()[0]
     assert result.column(0).to_pylist() == [1, 2, 3]
 
 
@@ -3655,13 +3664,27 @@ def test_explain_with_format(capsys):
     captured = capsys.readouterr()
     assert "plan_type" in captured.out
 
+    # PGJSON format: smoke test only, checks that the plan_type header is printed
+    df.explain(format=ExplainFormat.PGJSON)
+    captured = capsys.readouterr()
+    assert "plan_type" in captured.out
+
+    # Graphviz format: smoke test only, checks plan_type header (DOT body not verified)
+    df.explain(format=ExplainFormat.GRAPHVIZ)
+    captured = capsys.readouterr()
+    assert "plan_type" in captured.out
+
 
 def test_window():
     ctx = SessionContext()
     df = ctx.from_pydict({"a": [1, 2, 3], "b": ["x", "x", "y"]})
-    result = df.window(
-        f.row_number(partition_by=[column("b")], order_by=[column("a")]).alias("rn")
-    ).collect()[0]
+    result = (
+        df.window(
+            f.row_number(partition_by=[column("b")], order_by=[column("a")]).alias("rn")
+        )
+        .sort(column("a").sort(ascending=True))
+        .collect()[0]
+    )
     assert "rn" in result.schema.names
     assert result.column(result.schema.get_field_index("rn")).to_pylist() == [1, 2, 1]