|
25 | 25 | from typing import ( |
26 | 26 | TYPE_CHECKING, |
27 | 27 | Any, |
| 28 | + AsyncIterator, |
28 | 29 | Iterable, |
| 30 | + Iterator, |
29 | 31 | Literal, |
30 | 32 | Optional, |
31 | 33 | Union, |
|
42 | 44 | from datafusion._internal import ParquetWriterOptions as ParquetWriterOptionsInternal |
43 | 45 | from datafusion.expr import Expr, SortExpr, sort_or_default |
44 | 46 | from datafusion.plan import ExecutionPlan, LogicalPlan |
45 | | -from datafusion.record_batch import RecordBatchStream |
| 47 | +from datafusion.record_batch import ( |
| 48 | + RecordBatch, |
| 49 | + RecordBatchStream, |
| 50 | + to_record_batch_stream, |
| 51 | +) |
46 | 52 |
|
47 | 53 | if TYPE_CHECKING: |
48 | 54 | import pathlib |
|
53 | 59 | import pyarrow as pa |
54 | 60 |
|
55 | 61 | from datafusion._internal import expr as expr_internal |
| 62 | + from datafusion.record_batch import RecordBatch |
56 | 63 |
|
57 | 64 | from enum import Enum |
58 | 65 |
|
@@ -289,6 +296,9 @@ def __init__( |
289 | 296 | class DataFrame: |
290 | 297 | """Two dimensional table representation of data. |
291 | 298 |
|
| 299 | + DataFrame objects are iterable; iterating over a DataFrame yields |
| 300 | + :class:`RecordBatch` instances lazily. |
| 301 | +
|
292 | 302 | See :ref:`user_guide_concepts` in the online documentation for more information. |
293 | 303 | """ |
294 | 304 |
|
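For reference, a minimal sketch of the iteration behavior the new docstring describes, assuming this change is applied; the query and column name are illustrative:

```python
from datafusion import SessionContext

ctx = SessionContext()
df = ctx.sql("SELECT 1 AS a UNION ALL SELECT 2 AS a")

# Iterating the DataFrame pulls one RecordBatch at a time from the
# underlying stream; nothing is collected up front. Each yielded
# RecordBatch wraps a pyarrow.RecordBatch and converts via to_pyarrow().
for batch in df:
    print(batch.to_pyarrow().num_rows)
```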
@@ -1098,21 +1108,47 @@ def unnest_columns(self, *columns: str, preserve_nulls: bool = True) -> DataFram |
1098 | 1108 | return DataFrame(self.df.unnest_columns(columns, preserve_nulls=preserve_nulls)) |
1099 | 1109 |
|
1100 | 1110 | def __arrow_c_stream__(self, requested_schema: object | None = None) -> object: |
1101 | | - """Export an Arrow PyCapsule Stream. |
| 1111 | + """Export the DataFrame as an Arrow C Stream. |
| 1112 | +
|
| 1113 | + The DataFrame is executed using DataFusion's streaming APIs and exposed via |
| 1114 | + Arrow's C Stream interface. Record batches are produced incrementally, so the |
| 1115 | + full result set is never materialized in memory. When ``requested_schema`` is |
| 1116 | + provided, only straightforward projections such as column selection or |
| 1117 | + reordering are applied. |
1102 | 1118 |
|
1103 | | - This will execute and collect the DataFrame. We will attempt to respect the |
1104 | | - requested schema, but only trivial transformations will be applied such as only |
1105 | | - returning the fields listed in the requested schema if their data types match |
1106 | | - those in the DataFrame. |
| 1119 | + The returned capsule holds a reference to the originating |
| 1120 | + :class:`SessionContext`, keeping it alive until the stream is fully |
| 1121 | + consumed. This makes it safe to drop the original context after obtaining |
| 1122 | + the stream. |
1107 | 1123 |
|
1108 | 1124 | Args: |
1109 | 1125 | requested_schema: Attempt to provide the DataFrame using this schema. |
1110 | 1126 |
|
1111 | 1127 | Returns: |
1112 | | - Arrow PyCapsule object. |
| 1128 | + Arrow PyCapsule object representing an ``ArrowArrayStream``. |
1113 | 1129 | """ |
 | 1130 | + # ``DataFrame.__arrow_c_stream__`` in the Rust extension uses |
 | 1131 | + # ``execute_stream_partitioned`` under the hood to stream batches while |
 | 1132 | + # preserving the original partition order. |
1114 | 1133 | return self.df.__arrow_c_stream__(requested_schema) |
1115 | 1134 |
|
 | 1135 | + def __iter__(self) -> Iterator[RecordBatch]: |
 | 1136 | + """Iterate over :class:`RecordBatch` objects. |
| 1137 | +
|
| 1138 | + Results are streamed without materializing the full DataFrame. This |
| 1139 | + implementation delegates to :func:`to_record_batch_stream`, which executes |
| 1140 | + the :class:`DataFrame` and returns a :class:`RecordBatchStream`. |
| 1141 | + """ |
| 1142 | + return to_record_batch_stream(self).__iter__() |
| 1143 | + |
| 1144 | + def __aiter__(self) -> AsyncIterator[RecordBatch]: |
| 1145 | + """Asynchronously yield record batches from the DataFrame. |
| 1146 | +
|
| 1147 | + This delegates to :func:`to_record_batch_stream` to obtain a |
| 1148 | + :class:`RecordBatchStream` and returns its asynchronous iterator. |
| 1149 | + """ |
| 1150 | + return to_record_batch_stream(self).__aiter__() |
| 1151 | + |
1116 | 1152 | def transform(self, func: Callable[..., DataFrame], *args: Any) -> DataFrame: |
1117 | 1153 | """Apply a function to the current DataFrame which returns another DataFrame. |
1118 | 1154 |
|
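Because ``__arrow_c_stream__`` exposes the standard Arrow C Stream interface, any PyCapsule-aware consumer can read the DataFrame directly. A minimal sketch using pyarrow (``RecordBatchReader.from_stream`` requires pyarrow 14 or newer; the query is illustrative):

```python
import pyarrow as pa

from datafusion import SessionContext

ctx = SessionContext()
df = ctx.sql("SELECT 1 AS a, 'x' AS b")

# from_stream() invokes df.__arrow_c_stream__() and wraps the returned
# capsule; batches are pulled from DataFusion incrementally as the
# reader advances rather than being collected eagerly.
reader = pa.RecordBatchReader.from_stream(df)
for batch in reader:
    print(batch.num_rows)  # plain pyarrow.RecordBatch instances
```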
|
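The async path is symmetric. A sketch of consuming the DataFrame with ``async for`` via the new ``__aiter__``, assuming this change is applied and an event loop is available:

```python
import asyncio

from datafusion import SessionContext


async def main() -> None:
    ctx = SessionContext()
    df = ctx.sql("SELECT 1 AS a")
    # __aiter__ returns the RecordBatchStream's async iterator, so the
    # DataFrame can be consumed directly in an async for loop.
    async for batch in df:
        print(batch.to_pyarrow().num_rows)


asyncio.run(main())
```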