     DataFrame,
     ParquetColumnOptions,
     ParquetWriterOptions,
+    RecordBatch,
     SessionContext,
     WindowFrame,
     column,
@@ -1504,7 +1505,8 @@ def test_to_arrow_table(df):
 
 def test_execute_stream(df):
     stream = df.execute_stream()
-    assert all(batch is not None for batch in stream)
+    batches = list(stream)
+    assert all(isinstance(batch, RecordBatch) for batch in batches)
     assert not list(stream)  # after one iteration the generator must be exhausted
 
 
@@ -1513,7 +1515,7 @@ async def test_execute_stream_async(df):
     stream = df.execute_stream()
     batches = [batch async for batch in stream]
 
-    assert all(batch is not None for batch in batches)
+    assert all(isinstance(batch, RecordBatch) for batch in batches)
 
     # After consuming all batches, the stream should be exhausted
     remaining_batches = [batch async for batch in stream]
@@ -1557,10 +1559,10 @@ async def test_execute_stream_to_arrow_table_async(df, schema):
 
 def test_execute_stream_partitioned(df):
     streams = df.execute_stream_partitioned()
-    assert all(batch is not None for stream in streams for batch in stream)
-    assert all(
-        not list(stream) for stream in streams
-    )  # after one iteration all generators must be exhausted
+    for stream in streams:
+        batches = list(stream)
+        assert all(isinstance(batch, RecordBatch) for batch in batches)
+        assert not list(stream)
 
 
 @pytest.mark.asyncio
@@ -1569,7 +1571,7 @@ async def test_execute_stream_partitioned_async(df):
 
     for stream in streams:
         batches = [batch async for batch in stream]
-        assert all(batch is not None for batch in batches)
+        assert all(isinstance(batch, RecordBatch) for batch in batches)
 
         # Ensure the stream is exhausted after iteration
         remaining_batches = [batch async for batch in stream]
@@ -1593,7 +1595,17 @@ def test_iter_batches_dataframe(fail_collect):
 
     expected = [batch1, batch2]
     for got, exp in zip(df, expected):
-        assert got.equals(exp)
+        assert isinstance(got, RecordBatch)
+        assert got.to_pyarrow().equals(exp)
+
+
+def test_table_from_batches_dataframe(df, fail_collect):
+    table = pa.Table.from_batches(df)
+    assert table.shape == (3, 3)
+    assert set(table.column_names) == {"a", "b", "c"}
+
+    for batch in df:
+        assert isinstance(batch, RecordBatch)
 
 
 def test_arrow_c_stream_to_table_and_reader(fail_collect):
@@ -1855,8 +1867,6 @@ def test_write_parquet_with_options_default_compression(df, tmp_path):
     ["gzip(6)", "brotli(7)", "zstd(15)", "snappy", "uncompressed"],
 )
 def test_write_parquet_with_options_compression(df, tmp_path, compression):
-    import re
-
     path = tmp_path
     df.write_parquet_with_options(
         str(path), ParquetWriterOptions(compression=compression)