|
25 | 25 | from typing import ( |
26 | 26 | TYPE_CHECKING, |
27 | 27 | Any, |
28 | | - AsyncIterator, |
29 | 28 | Iterable, |
30 | | - Iterator, |
31 | 29 | Literal, |
32 | 30 | Optional, |
33 | 31 | Union, |
|
44 | 42 | from datafusion._internal import ParquetWriterOptions as ParquetWriterOptionsInternal |
45 | 43 | from datafusion.expr import Expr, SortExpr, sort_or_default |
46 | 44 | from datafusion.plan import ExecutionPlan, LogicalPlan |
47 | | -from datafusion.record_batch import ( |
48 | | - RecordBatch, |
49 | | - RecordBatchStream, |
50 | | - to_record_batch_stream, |
51 | | -) |
| 45 | +from datafusion.record_batch import RecordBatchStream |
52 | 46 |
|
53 | 47 | if TYPE_CHECKING: |
54 | 48 | import pathlib |
|
59 | 53 | import pyarrow as pa |
60 | 54 |
|
61 | 55 | from datafusion._internal import expr as expr_internal |
62 | | - from datafusion.record_batch import RecordBatch |
63 | 56 |
|
64 | 57 | from enum import Enum |
65 | 58 |
|
@@ -296,9 +289,6 @@ def __init__( |
296 | 289 | class DataFrame: |
297 | 290 | """Two dimensional table representation of data. |
298 | 291 |
|
299 | | - DataFrame objects are iterable; iterating over a DataFrame yields |
300 | | - :class:`pyarrow.RecordBatch` instances lazily. |
301 | | -
|
302 | 292 | See :ref:`user_guide_concepts` in the online documentation for more information. |
303 | 293 | """ |
304 | 294 |
|
@@ -1108,42 +1098,21 @@ def unnest_columns(self, *columns: str, preserve_nulls: bool = True) -> DataFrame |
1108 | 1098 | return DataFrame(self.df.unnest_columns(columns, preserve_nulls=preserve_nulls)) |
1109 | 1099 |
|
1110 | 1100 | def __arrow_c_stream__(self, requested_schema: object | None = None) -> object: |
1111 | | - """Export the DataFrame as an Arrow C Stream. |
| 1101 | + """Export an Arrow PyCapsule Stream. |
1112 | 1102 |
|
1113 | | - The DataFrame is executed using DataFusion's streaming APIs and exposed via |
1114 | | - Arrow's C Stream interface. Record batches are produced incrementally, so the |
1115 | | - full result set is never materialized in memory. When ``requested_schema`` is |
1116 | | - provided, only straightforward projections such as column selection or |
1117 | | - reordering are applied. |
| 1103 | + This will execute and collect the DataFrame. We will attempt to respect the |
| 1104 | + requested schema, but only trivial transformations will be applied, such as |
| 1105 | + returning only the fields listed in the requested schema when their data |
| 1106 | + types match those in the DataFrame. |
1118 | 1107 |
|
1119 | 1108 | Args: |
1120 | 1109 | requested_schema: Attempt to provide the DataFrame using this schema. |
1121 | 1110 |
|
1122 | 1111 | Returns: |
1123 | | - Arrow PyCapsule object representing an ``ArrowArrayStream``. |
| 1112 | + Arrow PyCapsule object. |
1124 | 1113 | """ |
1125 | | - # ``DataFrame.__arrow_c_stream__`` in the Rust extension leverages |
1126 | | - # ``execute_stream_partitioned`` under the hood to stream batches while |
1127 | | - # preserving the original partition order. |
1128 | 1114 | return self.df.__arrow_c_stream__(requested_schema) |
1129 | 1115 |
|
1130 | | - def __iter__(self) -> Iterator[pa.RecordBatch]: |
1131 | | - """Iterate over :class:`pyarrow.RecordBatch` objects. |
1132 | | -
|
1133 | | - Results are streamed without materializing the full DataFrame. This |
1134 | | - implementation delegates to :func:`to_record_batch_stream`, which executes |
1135 | | - the :class:`DataFrame` and returns a :class:`RecordBatchStream`. |
1136 | | - """ |
1137 | | - return to_record_batch_stream(self).__iter__() |
1138 | | - |
1139 | | - def __aiter__(self) -> AsyncIterator[RecordBatch]: |
1140 | | - """Asynchronously yield record batches from the DataFrame. |
1141 | | -
|
1142 | | - This delegates to :func:`to_record_batch_stream` to obtain a |
1143 | | - :class:`RecordBatchStream` and returns its asynchronous iterator. |
1144 | | - """ |
1145 | | - return to_record_batch_stream(self).__aiter__() |
1146 | | - |
1147 | 1116 | def transform(self, func: Callable[..., DataFrame], *args: Any) -> DataFrame: |
1148 | 1117 | """Apply a function to the current DataFrame which returns another DataFrame. |
1149 | 1118 |
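
With `__iter__` and `__aiter__` removed, callers that want incremental batches can still wrap the retained `__arrow_c_stream__` export, or use the streaming execution API directly. A minimal sketch of both routes, assuming `pyarrow >= 14` (for `RecordBatchReader.from_stream`) and that `DataFrame.execute_stream()` returning an iterable `RecordBatchStream` is still available after this revert; the table name and sample data are hypothetical:

```python
import pyarrow as pa
from datafusion import SessionContext

ctx = SessionContext()
ctx.from_pydict({"a": [1, 2, 3]}, name="t")  # hypothetical sample table
df = ctx.sql("SELECT a FROM t")

# Option 1: consume the Arrow PyCapsule stream kept by __arrow_c_stream__.
# pyarrow >= 14 can wrap any object implementing that protocol.
reader = pa.RecordBatchReader.from_stream(df)
for batch in reader:  # batch is a pyarrow.RecordBatch
    print(batch.num_rows)

# Option 2: stream batches explicitly through the RecordBatchStream API.
for batch in df.execute_stream():  # yields DataFusion RecordBatch wrappers
    print(batch.to_pyarrow().num_rows)
```
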
|
|