
Commit aba97dd

UNPICK
1 parent ec3f3de commit aba97dd

11 files changed (+55, -625 lines)

docs/source/user-guide/dataframe/index.rst

Lines changed: 1 addition & 30 deletions
@@ -145,39 +145,10 @@ To materialize the results of your DataFrame operations:
 
     # Display results
     df.show() # Print tabular format to console
-
+
     # Count rows
     count = df.count()
 
-PyArrow Streaming
------------------
-
-DataFusion DataFrames implement the ``__arrow_c_stream__`` protocol, enabling
-zero-copy streaming into libraries like `PyArrow <https://arrow.apache.org/>`_.
-Earlier versions eagerly converted the entire DataFrame when exporting to
-PyArrow, which could exhaust memory on large datasets. With streaming, batches
-are produced lazily so you can process arbitrarily large results without
-out-of-memory errors.
-
-.. code-block:: python
-
-    import pyarrow as pa
-
-    # Create a PyArrow RecordBatchReader without materializing all batches
-    reader = pa.RecordBatchReader._import_from_c_capsule(df.__arrow_c_stream__())
-    for batch in reader:
-        ... # process each batch as it is produced
-
-DataFrames are also iterable, yielding :class:`pyarrow.RecordBatch` objects
-lazily so you can loop over results directly:
-
-.. code-block:: python
-
-    for batch in df:
-        ... # process each batch as it is produced
-
-See :doc:`../io/arrow` for additional details on the Arrow interface.
-
 HTML Rendering
 --------------
 
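
Note: with the streaming section above removed, the user guide falls back to the eager materialization flow. A minimal sketch of that flow follows, assuming a ``SessionContext`` built from an in-memory dict (``from_pydict``) and the ``to_arrow_table`` helper from the wider datafusion-python API; neither appears in this diff, so treat them as assumptions.

    from datafusion import SessionContext

    # Hypothetical setup; from_pydict and to_arrow_table are assumed from the
    # broader datafusion-python API, not shown in this commit.
    ctx = SessionContext()
    df = ctx.from_pydict({"a": [1, 2, 3], "b": ["x", "y", "z"]})

    df.show()           # Print tabular format to console
    count = df.count()  # Count rows

    # Exporting to PyArrow now materializes the full result set eagerly.
    table = df.to_arrow_table()
    assert table.num_rows == count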

python/datafusion/dataframe.py

Lines changed: 6 additions & 31 deletions
@@ -26,7 +26,6 @@
     TYPE_CHECKING,
     Any,
     Iterable,
-    Iterator,
     Literal,
     Optional,
     Union,
@@ -290,9 +289,6 @@ def __init__(
 class DataFrame:
     """Two dimensional table representation of data.
 
-    DataFrame objects are iterable; iterating over a DataFrame yields
-    :class:`pyarrow.RecordBatch` instances lazily.
-
     See :ref:`user_guide_concepts` in the online documentation for more information.
     """
 
@@ -1102,42 +1098,21 @@ def unnest_columns(self, *columns: str, preserve_nulls: bool = True) -> DataFram
         return DataFrame(self.df.unnest_columns(columns, preserve_nulls=preserve_nulls))
 
     def __arrow_c_stream__(self, requested_schema: object | None = None) -> object:
-        """Export the DataFrame as an Arrow C Stream.
+        """Export an Arrow PyCapsule Stream.
 
-        The DataFrame is executed using DataFusion's streaming APIs and exposed via
-        Arrow's C Stream interface. Record batches are produced incrementally, so the
-        full result set is never materialized in memory. When ``requested_schema`` is
-        provided, only straightforward projections such as column selection or
-        reordering are applied.
+        This will execute and collect the DataFrame. We will attempt to respect the
+        requested schema, but only trivial transformations will be applied such as only
+        returning the fields listed in the requested schema if their data types match
+        those in the DataFrame.
 
         Args:
            requested_schema: Attempt to provide the DataFrame using this schema.
 
        Returns:
-            Arrow PyCapsule object representing an ``ArrowArrayStream``.
+            Arrow PyCapsule object.
         """
-        # ``DataFrame.__arrow_c_stream__`` in the Rust extension leverages
-        # ``execute_stream_partitioned`` under the hood to stream batches while
-        # preserving the original partition order.
         return self.df.__arrow_c_stream__(requested_schema)
 
-    def __iter__(self) -> Iterator[pa.RecordBatch]:
-        """Yield record batches from the DataFrame without materializing results.
-
-        This implementation streams record batches via the Arrow C Stream
-        interface, allowing callers such as :func:`pyarrow.Table.from_batches` to
-        consume results lazily. The DataFrame is executed using DataFusion's
-        partitioned streaming APIs so ``collect`` is never invoked and batch
-        order across partitions is preserved.
-        """
-        from contextlib import closing
-
-        import pyarrow as pa
-
-        reader = pa.RecordBatchReader._import_from_c_capsule(self.__arrow_c_stream__())
-        with closing(reader):
-            yield from reader
-
     def transform(self, func: Callable[..., DataFrame], *args: Any) -> DataFrame:
         """Apply a function to the current DataFrame which returns another DataFrame.
 
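
For context, a minimal sketch of how a caller consumes the capsule returned by ``__arrow_c_stream__`` after this change (per the updated docstring, the DataFrame is executed and collected eagerly). The setup mirrors the removed tests and uses only calls that appear elsewhere in this diff.

    import pyarrow as pa
    from datafusion import SessionContext

    ctx = SessionContext()
    batch = pa.record_batch([pa.array([1, 2, 3])], names=["a"])
    df = ctx.create_dataframe([[batch]])

    # Import the ArrowArrayStream capsule into a PyArrow reader, the same
    # pattern the removed tests used.
    reader = pa.RecordBatchReader._import_from_c_capsule(df.__arrow_c_stream__())
    table = reader.read_all()
    assert table.column("a").to_pylist() == [1, 2, 3]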

python/tests/conftest.py

Lines changed: 1 addition & 10 deletions
@@ -17,7 +17,7 @@
 
 import pyarrow as pa
 import pytest
-from datafusion import DataFrame, SessionContext
+from datafusion import SessionContext
 from pyarrow.csv import write_csv
 
 
@@ -49,12 +49,3 @@ def database(ctx, tmp_path):
         delimiter=",",
         schema_infer_max_records=10,
     )
-
-
-@pytest.fixture
-def fail_collect(monkeypatch):
-    def _fail_collect(self, *args, **kwargs): # pragma: no cover - failure path
-        msg = "collect should not be called"
-        raise AssertionError(msg)
-
-    monkeypatch.setattr(DataFrame, "collect", _fail_collect)

python/tests/test_dataframe.py

Lines changed: 0 additions & 220 deletions
@@ -46,8 +46,6 @@
 from datafusion.expr import Window
 from pyarrow.csv import write_csv
 
-pa_cffi = pytest.importorskip("pyarrow.cffi")
-
 MB = 1024 * 1024
 
 
@@ -1584,120 +1582,6 @@ def test_empty_to_arrow_table(df):
     assert set(pyarrow_table.column_names) == {"a", "b", "c"}
 
 
-def test_iter_batches_dataframe(fail_collect):
-    ctx = SessionContext()
-
-    batch1 = pa.record_batch([pa.array([1])], names=["a"])
-    batch2 = pa.record_batch([pa.array([2])], names=["a"])
-    df = ctx.create_dataframe([[batch1], [batch2]])
-
-    expected = [batch1, batch2]
-    for got, exp in zip(df, expected):
-        assert got.equals(exp)
-
-
-def test_arrow_c_stream_to_table_and_reader(fail_collect):
-    ctx = SessionContext()
-
-    # Create a DataFrame with two separate record batches
-    batch1 = pa.record_batch([pa.array([1])], names=["a"])
-    batch2 = pa.record_batch([pa.array([2])], names=["a"])
-    df = ctx.create_dataframe([[batch1], [batch2]])
-
-    table = pa.Table.from_batches(df)
-    batches = table.to_batches()
-
-    assert len(batches) == 2
-    assert batches[0].equals(batch1)
-    assert batches[1].equals(batch2)
-    assert table.schema == df.schema()
-    assert table.column("a").num_chunks == 2
-
-    reader = pa.RecordBatchReader._import_from_c_capsule(df.__arrow_c_stream__())
-    assert isinstance(reader, pa.RecordBatchReader)
-    reader_table = pa.Table.from_batches(reader)
-    expected = pa.Table.from_batches([batch1, batch2])
-    assert reader_table.equals(expected)
-
-
-def test_arrow_c_stream_order():
-    ctx = SessionContext()
-
-    batch1 = pa.record_batch([pa.array([1])], names=["a"])
-    batch2 = pa.record_batch([pa.array([2])], names=["a"])
-
-    df = ctx.create_dataframe([[batch1, batch2]])
-
-    table = pa.Table.from_batches(df)
-    expected = pa.Table.from_batches([batch1, batch2])
-
-    assert table.equals(expected)
-    col = table.column("a")
-    assert col.chunk(0)[0].as_py() == 1
-    assert col.chunk(1)[0].as_py() == 2
-
-
-def test_arrow_c_stream_schema_selection(fail_collect):
-    ctx = SessionContext()
-
-    batch = pa.RecordBatch.from_arrays(
-        [
-            pa.array([1, 2]),
-            pa.array([3, 4]),
-            pa.array([5, 6]),
-        ],
-        names=["a", "b", "c"],
-    )
-    df = ctx.create_dataframe([[batch]])
-
-    requested_schema = pa.schema([("c", pa.int64()), ("a", pa.int64())])
-
-    c_schema = pa_cffi.ffi.new("struct ArrowSchema*")
-    address = int(pa_cffi.ffi.cast("uintptr_t", c_schema))
-    requested_schema._export_to_c(address)
-    capsule_new = ctypes.pythonapi.PyCapsule_New
-    capsule_new.restype = ctypes.py_object
-    capsule_new.argtypes = [ctypes.c_void_p, ctypes.c_char_p, ctypes.c_void_p]
-    schema_capsule = capsule_new(ctypes.c_void_p(address), b"arrow_schema", None)
-
-    reader = pa.RecordBatchReader._import_from_c_capsule(
-        df.__arrow_c_stream__(schema_capsule)
-    )
-
-    assert reader.schema == requested_schema
-
-    batches = list(reader)
-
-    assert len(batches) == 1
-    expected_batch = pa.record_batch(
-        [pa.array([5, 6]), pa.array([1, 2])], names=["c", "a"]
-    )
-    assert batches[0].equals(expected_batch)
-
-
-def test_arrow_c_stream_schema_mismatch(fail_collect):
-    ctx = SessionContext()
-
-    batch = pa.RecordBatch.from_arrays(
-        [pa.array([1, 2]), pa.array([3, 4])], names=["a", "b"]
-    )
-    df = ctx.create_dataframe([[batch]])
-
-    bad_schema = pa.schema([("a", pa.string())])
-
-    c_schema = pa_cffi.ffi.new("struct ArrowSchema*")
-    address = int(pa_cffi.ffi.cast("uintptr_t", c_schema))
-    bad_schema._export_to_c(address)
-
-    capsule_new = ctypes.pythonapi.PyCapsule_New
-    capsule_new.restype = ctypes.py_object
-    capsule_new.argtypes = [ctypes.c_void_p, ctypes.c_char_p, ctypes.c_void_p]
-    bad_capsule = capsule_new(ctypes.c_void_p(address), b"arrow_schema", None)
-
-    with pytest.raises(Exception, match="Fail to merge schema"):
-        df.__arrow_c_stream__(bad_capsule)
-
-
 def test_to_pylist(df):
     # Convert datafusion dataframe to Python list
     pylist = df.to_pylist()
@@ -2782,110 +2666,6 @@ def trigger_interrupt():
     interrupt_thread.join(timeout=1.0)
 
 
-def test_arrow_c_stream_interrupted():
-    """__arrow_c_stream__ responds to ``KeyboardInterrupt`` signals.
-
-    Similar to ``test_collect_interrupted`` this test issues a long running
-    query, but consumes the results via ``__arrow_c_stream__``. It then raises
-    ``KeyboardInterrupt`` in the main thread and verifies that the stream
-    iteration stops promptly with the appropriate exception.
-    """
-
-    ctx = SessionContext()
-
-    batches = []
-    for i in range(10):
-        batch = pa.RecordBatch.from_arrays(
-            [
-                pa.array(list(range(i * 1000, (i + 1) * 1000))),
-                pa.array([f"value_{j}" for j in range(i * 1000, (i + 1) * 1000)]),
-            ],
-            names=["a", "b"],
-        )
-        batches.append(batch)
-
-    ctx.register_record_batches("t1", [batches])
-    ctx.register_record_batches("t2", [batches])
-
-    df = ctx.sql(
-        """
-        WITH t1_expanded AS (
-            SELECT
-                a,
-                b,
-                CAST(a AS DOUBLE) / 1.5 AS c,
-                CAST(a AS DOUBLE) * CAST(a AS DOUBLE) AS d
-            FROM t1
-            CROSS JOIN (SELECT 1 AS dummy FROM t1 LIMIT 5)
-        ),
-        t2_expanded AS (
-            SELECT
-                a,
-                b,
-                CAST(a AS DOUBLE) * 2.5 AS e,
-                CAST(a AS DOUBLE) * CAST(a AS DOUBLE) * CAST(a AS DOUBLE) AS f
-            FROM t2
-            CROSS JOIN (SELECT 1 AS dummy FROM t2 LIMIT 5)
-        )
-        SELECT
-            t1.a, t1.b, t1.c, t1.d,
-            t2.a AS a2, t2.b AS b2, t2.e, t2.f
-        FROM t1_expanded t1
-        JOIN t2_expanded t2 ON t1.a % 100 = t2.a % 100
-        WHERE t1.a > 100 AND t2.a > 100
-        """
-    )
-
-    reader = pa.RecordBatchReader._import_from_c_capsule(df.__arrow_c_stream__())
-
-    interrupted = False
-    interrupt_error = None
-    query_started = threading.Event()
-    max_wait_time = 5.0
-
-    def trigger_interrupt():
-        start_time = time.time()
-        while not query_started.is_set():
-            time.sleep(0.1)
-            if time.time() - start_time > max_wait_time:
-                msg = f"Query did not start within {max_wait_time} seconds"
-                raise RuntimeError(msg)
-
-        thread_id = threading.main_thread().ident
-        if thread_id is None:
-            msg = "Cannot get main thread ID"
-            raise RuntimeError(msg)
-
-        exception = ctypes.py_object(KeyboardInterrupt)
-        res = ctypes.pythonapi.PyThreadState_SetAsyncExc(
-            ctypes.c_long(thread_id), exception
-        )
-        if res != 1:
-            ctypes.pythonapi.PyThreadState_SetAsyncExc(
-                ctypes.c_long(thread_id), ctypes.py_object(0)
-            )
-            msg = "Failed to raise KeyboardInterrupt in main thread"
-            raise RuntimeError(msg)
-
-    interrupt_thread = threading.Thread(target=trigger_interrupt)
-    interrupt_thread.daemon = True
-    interrupt_thread.start()
-
-    try:
-        query_started.set()
-        # consume the reader which should block and be interrupted
-        reader.read_all()
-    except KeyboardInterrupt:
-        interrupted = True
-    except Exception as e:  # pragma: no cover - unexpected errors
-        interrupt_error = e
-
-    if not interrupted:
-        pytest.fail(f"Stream was not interrupted; got error: {interrupt_error}")
-
-    interrupt_thread.join(timeout=1.0)
-
-
 def test_show_select_where_no_rows(capsys) -> None:
     ctx = SessionContext()
     df = ctx.sql("SELECT 1 WHERE 1=0")
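
Aside: the removed schema-selection tests build the ``arrow_schema`` capsule by hand through ``pyarrow.cffi`` and ``ctypes.pythonapi.PyCapsule_New``. A sketch of a simpler alternative using PyArrow's PyCapsule interface (``Schema.__arrow_c_schema__``; this assumes a PyArrow release that ships the capsule protocol, which this commit does not require):

    import pyarrow as pa

    requested_schema = pa.schema([("c", pa.int64()), ("a", pa.int64())])

    # Recent PyArrow versions export the "arrow_schema" capsule directly,
    # avoiding the manual cffi/ctypes construction used in the removed tests.
    schema_capsule = requested_schema.__arrow_c_schema__()

    # The capsule can then be passed to DataFrame.__arrow_c_stream__(schema_capsule).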
