feat: enhance Arrow interoperability by exposing DataFrame results via C Stream interface and updating tests for RecordBatch compatibility

kosiew · kosiew · commit a0fc7317cb79 · 2025-09-06T18:24:23.000+08:00
diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py
@@ -1136,10 +1136,16 @@ def __arrow_c_stream__(self, requested_schema: object | None = None) -> object:
     def __iter__(self) -> Iterator[RecordBatch]:
         """Yield record batches from this DataFrame lazily.
 
-        This delegates to :py:meth:`to_stream` without eagerly materializing the
-        entire result set.
+        Batches are pulled through Arrow's C Stream interface
+        (:py:meth:`__arrow_c_stream__`) and yielded as
+        :py:class:`pyarrow.RecordBatch` objects, so PyArrow consumers such as
+        :py:meth:`pyarrow.Table.from_batches` can use them directly without
+        a per-batch conversion step in Python.
         """
-        return iter(self.to_stream())
+        import pyarrow as pa
+
+        reader = pa.RecordBatchReader._import_from_c_capsule(self.__arrow_c_stream__())
+        yield from reader
 
     def __aiter__(self) -> AsyncIterator[RecordBatch]:
         """Asynchronously yield record batches from this DataFrame lazily."""
diff --git a/python/tests/test_arrow_interop.py b/python/tests/test_arrow_interop.py
@@ -1,6 +1,8 @@
 import pyarrow as pa
 import pytest
 
+from .utils import range_table
+
 
 def test_table_from_batches_with_dataframe(ctx):
     batch1 = pa.record_batch({"a": pa.array([1, 2]), "b": pa.array(["x", "y"])})
@@ -30,3 +32,19 @@ def test_table_from_batches_with_record_batch(ctx):
 
     expected = pa.Table.from_batches([batch])
     assert table.equals(expected)
+
+
+def test_table_from_batches_with_range_table(ctx):
+    df = range_table(ctx, 0, 5)
+
+    try:
+        table = pa.Table.from_batches(df)
+    except TypeError as err:  # pragma: no cover - failure path
+        pytest.fail(
+            f"TypeError raised when converting range DataFrame to Arrow Table: {err}"
+        )
+
+    # Create a schema with non-nullable field to match the actual output
+    schema = pa.schema([pa.field("value", pa.int64(), nullable=False)])
+    expected = pa.table({"value": pa.array(range(5), type=pa.int64())}, schema=schema)
+    assert table.equals(expected)
diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py
@@ -1595,8 +1595,8 @@ def test_iter_batches_dataframe(fail_collect):
 
     expected = [batch1, batch2]
     for got, exp in zip(df, expected):
-        assert isinstance(got, RecordBatch)
-        assert got.to_pyarrow().equals(exp)
+        assert isinstance(got, pa.RecordBatch)
+        assert got.equals(exp)
 
 
 def test_table_from_batches_dataframe(df, fail_collect):
@@ -1605,7 +1605,7 @@ def test_table_from_batches_dataframe(df, fail_collect):
     assert set(table.column_names) == {"a", "b", "c"}
 
     for batch in df:
-        assert isinstance(batch, RecordBatch)
+        assert isinstance(batch, pa.RecordBatch)
 
 
 def test_arrow_c_stream_to_table_and_reader(fail_collect):
diff --git a/python/tests/test_dataframe_iter_stream.py b/python/tests/test_dataframe_iter_stream.py
@@ -16,13 +16,13 @@
 # under the License.
 
 
-from datafusion.record_batch import RecordBatch
+import pyarrow as pa
 
 
 def test_to_stream(ctx):
     df = ctx.from_pydict({"a": [1, 2]})
     stream = df.to_stream()
-    batches = [rb.to_pyarrow() for rb in stream]
+    batches = list(stream)
     assert len(batches) == 1
     assert batches[0].to_pydict() == {"a": [1, 2]}
 
@@ -31,5 +31,5 @@ def test_dataframe_iter(ctx):
     df = ctx.from_pydict({"a": [1, 2]})
     batches = list(df)
     assert len(batches) == 1
-    assert isinstance(batches[0], RecordBatch)
-    assert batches[0].to_pyarrow().to_pydict() == {"a": [1, 2]}
+    assert isinstance(batches[0], pa.RecordBatch)
+    assert batches[0].to_pydict() == {"a": [1, 2]}
diff --git a/python/tests/test_io.py b/python/tests/test_io.py
@@ -19,7 +19,7 @@
 
 import pyarrow as pa
 import pytest
-from datafusion import RecordBatch, column
+from datafusion import column
 from datafusion.io import read_avro, read_csv, read_json, read_parquet
 
 from .utils import range_table
@@ -133,4 +133,4 @@ def test_table_from_batches_stream(ctx, fail_collect):
     assert table.column_names == ["value"]
 
     for batch in df:
-        assert isinstance(batch, RecordBatch)
+        assert isinstance(batch, pa.RecordBatch)