kosiew
diff --git a/‎Cargo.lock‎
Lines changed: 297 additions & 285 deletions b/‎Cargo.lock‎
Lines changed: 297 additions & 285 deletions
diff --git a/‎Cargo.toml‎
Lines changed: 2 additions & 1 deletion b/‎Cargo.toml‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎benchmarks/collect_gil_bench.py‎
Lines changed: 24 additions & 0 deletions b/‎benchmarks/collect_gil_bench.py‎
Lines changed: 24 additions & 0 deletions
diff --git a/‎docs/source/user-guide/configuration.rst‎
Lines changed: 21 additions & 0 deletions b/‎docs/source/user-guide/configuration.rst‎
Lines changed: 21 additions & 0 deletions
diff --git a/‎docs/source/user-guide/dataframe/collect-gil.md‎
Lines changed: 26 additions & 0 deletions b/‎docs/source/user-guide/dataframe/collect-gil.md‎
Lines changed: 26 additions & 0 deletions
diff --git a/‎docs/source/user-guide/dataframe/index.rst‎
Lines changed: 32 additions & 9 deletions b/‎docs/source/user-guide/dataframe/index.rst‎
Lines changed: 32 additions & 9 deletions
diff --git a/‎docs/source/user-guide/io/arrow.rst‎
Lines changed: 24 additions & 7 deletions b/‎docs/source/user-guide/io/arrow.rst‎
Lines changed: 24 additions & 7 deletions
diff --git a/‎python/datafusion/context.py‎
Lines changed: 7 additions & 1 deletion b/‎python/datafusion/context.py‎
Lines changed: 7 additions & 1 deletion
diff --git a/‎python/tests/test_dataframe.py‎
Lines changed: 57 additions & 23 deletions b/‎python/tests/test_dataframe.py‎
Lines changed: 57 additions & 23 deletions
diff --git a/‎python/tests/test_record_batch_stream.py‎
Lines changed: 18 additions & 0 deletions b/‎python/tests/test_record_batch_stream.py‎
Lines changed: 18 additions & 0 deletions
@@ -26,7 +26,7 @@ readme = "README.md"
 license = "Apache-2.0"
 edition = "2021"
 rust-version = "1.78"
-include = ["/src", "/datafusion", "/LICENSE.txt", "build.rs", "pyproject.toml", "Cargo.toml", "Cargo.lock"]
+include = ["/src", "/datafusion", "/LICENSE.txt", "pyproject.toml", "Cargo.toml", "Cargo.lock"]
 
 [features]
 default = ["mimalloc"]
@@ -48,6 +48,7 @@ uuid = { version = "1.18", features = ["v4"] }
 mimalloc = { version = "0.1", optional = true, default-features = false, features = ["local_dynamic_tls"] }
 async-trait = "0.1.89"
 futures = "0.3"
+rayon = "1.10"
 object_store = { version = "0.12.3", features = ["aws", "gcp", "azure", "http"] }
 url = "2"
 log = "0.4.27"
 
@@ -0,0 +1,24 @@
+import time
+
+import pyarrow as pa
+from datafusion import SessionContext
+
+
+def run(n_batches: int = 8, batch_size: int = 1_000_000) -> None:
+    ctx = SessionContext()
+    batches = []
+    for i in range(n_batches):
+        start = i * batch_size
+        arr = pa.array(range(start, start + batch_size))
+        batches.append(pa.record_batch([arr], names=["a"]))
+
+    df = ctx.create_dataframe([batches])
+
+    start = time.perf_counter()
+    df.collect()
+    duration = time.perf_counter() - start
+    print(f"{n_batches} batches collected in {duration:.3f}s")
+
+
+if __name__ == "__main__":
+    run()
@@ -47,5 +47,26 @@ a :py:class:`~datafusion.context.SessionConfig` and :py:class:`~datafusion.conte
     print(ctx)
 
 
+.. _target_partitions:
+
+Target partitions and threads
+-----------------------------
+
+The :py:meth:`~datafusion.context.SessionConfig.with_target_partitions` method
+controls how many partitions DataFusion uses when executing a query. Each
+partition is processed on its own thread, so this setting effectively limits
+the number of threads that will be scheduled.
+
+For most workloads a good starting value is the number of logical CPU cores on
+your machine. You can use :func:`os.cpu_count` to automatically configure this::
+
+    import os
+    config = SessionConfig().with_target_partitions(os.cpu_count())
+
+Choosing a value significantly higher than the available cores can lead to
+excessive context switching without performance gains, while a much lower value
+may underutilize the machine.
+
+
 You can read more about available :py:class:`~datafusion.context.SessionConfig` options in the `rust DataFusion Configuration guide <https://arrow.apache.org/datafusion/user-guide/configs.html>`_,
 and about :code:`RuntimeEnvBuilder` options in the rust `online API documentation <https://docs.rs/datafusion/latest/datafusion/execution/runtime_env/struct.RuntimeEnvBuilder.html>`_.
@@ -0,0 +1,26 @@
+# RecordBatch conversion and the GIL
+
+Profiling `DataFrame.collect` showed that converting each `RecordBatch` to
+PyArrow via `rb.to_pyarrow(py)` spent considerable time holding the Python GIL.
+Using `py-spy` on a query returning many batches indicated that more than
+95 % of the conversion executed while the GIL was held, meaning the work was
+effectively serialised.
+For queries that return many batches this limited CPU utilisation because only
+one conversion could run at a time.
+
+The implementation now converts each batch to Arrow's C data (schema/array)
+while the GIL is released, acquiring the GIL only to wrap those pointers into
+PyArrow objects. This allows the CPU intensive portions of the conversion to
+run fully in parallel.
+
+A simple benchmark is provided in `benchmarks/collect_gil_bench.py`.
+Run it twice to compare serial and parallel conversions:
+
+```bash
+RAYON_NUM_THREADS=1 python benchmarks/collect_gil_bench.py   # serial
+python benchmarks/collect_gil_bench.py                      # parallel
+```
+
+On this container, collecting 128 1 M‑row batches took around 1.5 s with
+`RAYON_NUM_THREADS=1` and 0.8 s with the default thread pool, demonstrating
+that releasing the GIL allows conversions to run in parallel.
@@ -25,8 +25,10 @@ The ``DataFrame`` class is the core abstraction in DataFusion that represents ta
 on that data. DataFrames provide a flexible API for transforming data through various operations such as
 filtering, projection, aggregation, joining, and more.
 
-A DataFrame represents a logical plan that is lazily evaluated. The actual execution occurs only when 
-terminal operations like ``collect()``, ``show()``, or ``to_pandas()`` are called.
+A DataFrame represents a logical plan that is lazily evaluated. The actual execution occurs only when
+terminal operations like ``collect()``, ``show()``, or ``to_pandas()`` are called. ``collect()`` loads
+all record batches into Python memory; for large results you may want to stream data instead using
+``execute_stream()`` or ``__arrow_c_stream__()``.
 
 Creating DataFrames
 -------------------
@@ -128,27 +130,47 @@ DataFusion's DataFrame API offers a wide range of operations:
 
 Terminal Operations
 -------------------
-
-To materialize the results of your DataFrame operations:
+``collect()`` materializes every record batch in Python. While convenient, this
+eagerly loads the full result set into memory and can overwhelm the Python
+process for large queries. Alternatives that stream data from Rust avoid this
+memory growth:
 
 .. code-block:: python
 
-    # Collect all data as PyArrow RecordBatches
+    # Collect all data as PyArrow RecordBatches (loads entire result set)
     result_batches = df.collect()
-    
-    # Convert to various formats
+
+    # Stream batches using the native API
+    stream = df.execute_stream()
+    for batch in stream:
+        ...  # process each RecordBatch
+
+    # Stream via the Arrow C Data Interface
+    import pyarrow as pa
+    reader = pa.ipc.RecordBatchStreamReader._import_from_c(df.__arrow_c_stream__())
+    for batch in reader:
+        ...
+
+    # Convert to various formats (also load all data into memory)
     pandas_df = df.to_pandas()        # Pandas DataFrame
     polars_df = df.to_polars()        # Polars DataFrame
     arrow_table = df.to_arrow_table() # PyArrow Table
     py_dict = df.to_pydict()          # Python dictionary
     py_list = df.to_pylist()          # Python list of dictionaries
-    
+
     # Display results
     df.show()                         # Print tabular format to console
-    
+
     # Count rows
     count = df.count()
 
+For large outputs, prefer engine-level writers such as ``df.write_parquet()``
+or other DataFusion writers. These stream data directly to the destination and
+avoid buffering the entire dataset in Python.
+
+For more on parallel record batch conversion and the Python GIL, see
+:doc:`collect-gil`.
+
 HTML Rendering
 --------------
 
@@ -207,3 +229,4 @@ For a complete list of available functions, see the :py:mod:`datafusion.function
    :maxdepth: 1
 
    rendering
+   collect-gil
@@ -57,17 +57,34 @@ and returns a ``StructArray``. Common pyarrow sources you can use are:
 Exporting from DataFusion
 -------------------------
 
-DataFusion DataFrames implement ``__arrow_c_stream__`` PyCapsule interface, so any
-Python library that accepts these can import a DataFusion DataFrame directly.
+DataFusion DataFrames implement ``__arrow_c_stream__`` so any Python library
+that accepts this interface can import a DataFusion ``DataFrame`` directly.
 
-.. warning::
-    It is important to note that this will cause the DataFrame execution to happen, which may be
-    a time consuming task. That is, you will cause a
-    :py:func:`datafusion.dataframe.DataFrame.collect` operation call to occur.
+``collect()`` or ``pa.table(df)`` will materialize every record batch in
+Python. For large results this can quickly exhaust memory. Instead, stream the
+output incrementally:
 
+.. ipython:: python
+
+    # Stream batches with DataFusion's native API
+    stream = df.execute_stream()
+    for batch in stream:
+        ...  # process each RecordBatch as it arrives
+
+.. ipython:: python
+
+    # Expose a C stream that PyArrow can consume lazily
+    import pyarrow as pa
+    reader = pa.ipc.RecordBatchStreamReader._import_from_c(df.__arrow_c_stream__())
+    for batch in reader:
+        ...  # process each batch without buffering the entire table
+
+If the goal is simply to persist results, prefer engine-level writers such as
+``df.write_parquet()``. These writers stream data from Rust directly to the
+destination and avoid Python-side memory growth.
 
 .. ipython:: python
 
     df = df.select((col("a") * lit(1.5)).alias("c"), lit("df").alias("d"))
-    pa.table(df)
+    pa.table(df)  # loads all batches into memory
 
@@ -161,7 +161,13 @@ def with_batch_size(self, batch_size: int) -> SessionConfig:
     def with_target_partitions(self, target_partitions: int) -> SessionConfig:
         """Customize the number of target partitions for query execution.
 
-        Increasing partitions can increase concurrency.
+        Each partition is processed on its own thread, so this value controls
+        the degree of parallelism. A good starting point is the number of
+        logical CPU cores on your machine, for example
+        ``SessionConfig().with_target_partitions(os.cpu_count())``.
+
+        See the :ref:`configuration guide <target_partitions>` for more
+        discussion on choosing a value.
 
         Args:
             target_partitions: Number of target partitions.
 
@@ -20,6 +20,7 @@
 import re
 import threading
 import time
+import tracemalloc
 from typing import Any
 
 import pyarrow as pa
@@ -252,13 +253,6 @@ def test_filter(df):
     assert result.column(2) == pa.array([5])
 
 
-def test_show_empty(df, capsys):
-    df_empty = df.filter(column("a") > literal(3))
-    df_empty.show()
-    captured = capsys.readouterr()
-    assert "DataFrame has no rows" in captured.out
-
-
 def test_sort(df):
     df = df.sort(column("b").sort(ascending=False))
 
@@ -1390,6 +1384,27 @@ def test_collect_partitioned():
     assert [[batch]] == ctx.create_dataframe([[batch]]).collect_partitioned()
 
 
+def test_collect_multiple_batches_to_pyarrow():
+    ctx = SessionContext()
+
+    batch1 = pa.RecordBatch.from_arrays(
+        [pa.array([1, 2])],
+        names=["a"],
+    )
+    batch2 = pa.RecordBatch.from_arrays(
+        [pa.array([3, 4])],
+        names=["a"],
+    )
+
+    df = ctx.create_dataframe([[batch1], [batch2]])
+
+    batches = df.collect()
+
+    assert len(batches) == 2
+    table = pa.Table.from_batches(batches)
+    assert table.column("a").to_pylist() == [1, 2, 3, 4]
+
+
 def test_union(ctx):
     batch = pa.RecordBatch.from_arrays(
         [pa.array([1, 2, 3]), pa.array([4, 5, 6])],
@@ -1470,6 +1485,24 @@ def test_empty_to_pandas(df):
     assert set(pandas_df.columns) == {"a", "b", "c"}
 
 
+def test_show_no_batches(capsys):
+    """Ensure showing a query with no batches still prints headers."""
+    ctx = SessionContext()
+    df = ctx.sql("SELECT 1 AS a WHERE 1=0")
+    df.show()
+    captured = capsys.readouterr()
+    assert "| a |" in captured.out
+    assert "Empty DataFrame" not in captured.out
+
+
+def test_show_empty_dataframe(df, capsys):
+    """Ensure showing an empty DataFrame prints a helpful message."""
+    empty_df = df.limit(0)
+    empty_df.show()
+    captured = capsys.readouterr()
+    assert "Empty DataFrame" in captured.out
+
+
 def test_to_polars(df):
     # Skip test if polars is not installed
     pl = pytest.importorskip("polars")
@@ -1574,6 +1607,23 @@ async def test_execute_stream_partitioned_async(df):
         assert not remaining_batches
 
 
+def test_arrow_c_stream_streaming(large_df):
+    df = large_df.repartition(4)
+    capsule = df.__arrow_c_stream__()
+    ctypes.pythonapi.PyCapsule_GetPointer.restype = ctypes.c_void_p
+    ctypes.pythonapi.PyCapsule_GetPointer.argtypes = [ctypes.py_object, ctypes.c_char_p]
+    ptr = ctypes.pythonapi.PyCapsule_GetPointer(capsule, b"arrow_array_stream")
+    reader = pa.RecordBatchReader._import_from_c(ptr)
+
+    tracemalloc.start()
+    batch_count = sum(1 for _ in reader)
+    _current, peak = tracemalloc.get_traced_memory()
+    tracemalloc.stop()
+
+    assert batch_count > 1
+    assert peak < 50 * MB
+
+
 def test_empty_to_arrow_table(df):
     # Convert empty datafusion dataframe to pyarrow Table
     pyarrow_table = df.limit(0).to_arrow_table()
@@ -2664,19 +2714,3 @@ def trigger_interrupt():
 
     # Make sure the interrupt thread has finished
     interrupt_thread.join(timeout=1.0)
-
-
-def test_show_select_where_no_rows(capsys) -> None:
-    ctx = SessionContext()
-    df = ctx.sql("SELECT 1 WHERE 1=0")
-    df.show()
-    out = capsys.readouterr().out
-    assert "DataFrame has no rows" in out
-
-
-def test_show_from_empty_batch(capsys) -> None:
-    ctx = SessionContext()
-    batch = pa.record_batch([pa.array([], type=pa.int32())], names=["a"])
-    ctx.create_dataframe([[batch]]).show()
-    out = capsys.readouterr().out
-    assert "| a |" in out
 
@@ -0,0 +1,18 @@
+import pytest
+
+
+def test_record_batch_stream_next(ctx):
+    stream = ctx.sql("SELECT 1 as a").execute_stream()
+    batch = next(stream)
+    assert batch.to_pyarrow().num_rows == 1
+    with pytest.raises(StopIteration):
+        next(stream)
+
+
+@pytest.mark.asyncio
+async def test_record_batch_stream_anext(ctx):
+    stream = ctx.sql("SELECT 1 as a").execute_stream()
+    batch = await stream.__anext__()
+    assert batch.to_pyarrow().num_rows == 1
+    with pytest.raises(StopAsyncIteration):
+        await stream.__anext__()