Skip to content

Commit 73e1b06

Browse files
committed
Add range method to SessionContext and iterator support to DataFrame for improved data handling
1 parent 129a7ae commit 73e1b06

File tree

2 files changed

+45
-0
lines changed

2 files changed

+45
-0
lines changed

python/datafusion/context.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -731,6 +731,37 @@ def from_polars(self, data: pl.DataFrame, name: str | None = None) -> DataFrame:
731731
"""
732732
return DataFrame(self.ctx.from_polars(data, name))
733733

734+
def range(
    self,
    start: int,
    stop: int | None = None,
    step: int = 1,
    partitions: int | None = None,
) -> DataFrame:
    """Create a DataFrame containing a sequence of numbers.

    This is backed by DataFusion's ``range`` table function, which generates
    values lazily and therefore does not materialize the full range in
    memory. When ``stop`` is omitted, ``start`` is treated as the stop value
    and the sequence begins at zero, mirroring Python's built-in ``range``.

    Args:
        start: Starting value for the sequence or the exclusive stop if
            ``stop`` is ``None``.
        stop: Exclusive upper bound of the sequence.
        step: Increment between successive values. Must be non-zero.
        partitions: Optional number of partitions for the generated data.

    Returns:
        DataFrame yielding the requested range of values.

    Raises:
        ValueError: If ``step`` is zero, matching the behavior of Python's
            built-in ``range``.
    """
    if int(step) == 0:
        # Fail fast with a clear Python-level error instead of an opaque
        # engine-side failure from the underlying range() table function.
        raise ValueError("range() arg 3 must not be zero")
    if stop is None:
        # Single-argument form: range(n) is equivalent to range(0, n).
        start, stop = 0, start

    # Every interpolated value is coerced through int(), so the f-string
    # built SQL cannot carry an injection payload.
    parts = f", {int(partitions)}" if partitions is not None else ""
    sql = f"SELECT * FROM range({int(start)}, {int(stop)}, {int(step)}{parts})"  # noqa: S608
    return self.sql(sql)
764+
734765
# https://github.com/apache/datafusion-python/pull/1016#discussion_r1983239116
735766
# is the discussion on how we arrived at adding register_view
736767
def register_view(self, name: str, df: DataFrame) -> None:

python/datafusion/dataframe.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
TYPE_CHECKING,
2727
Any,
2828
Iterable,
29+
Iterator,
2930
Literal,
3031
Optional,
3132
Union,
@@ -1116,6 +1117,19 @@ def __arrow_c_stream__(self, requested_schema: object | None = None) -> object:
11161117
# ``execute_stream`` under the hood to stream batches one at a time.
11171118
return self.df.__arrow_c_stream__(requested_schema)
11181119

1120+
def __iter__(self) -> Iterator[pa.RecordBatch]:
    """Yield record batches from the DataFrame without materializing results.

    Record batches are streamed through the Arrow C Stream interface
    (``__arrow_c_stream__``), allowing callers such as
    :func:`pyarrow.Table.from_batches` to consume results lazily. The
    DataFrame is executed using DataFusion's streaming APIs so ``collect``
    is never invoked.
    """
    import pyarrow as pa

    # ``RecordBatchReader.from_stream`` consumes any object implementing
    # the Arrow PyCapsule stream protocol (``__arrow_c_stream__``). The
    # previous ``_import_from_c`` call is a private API that expects a raw
    # integer pointer address, not the PyCapsule that
    # ``__arrow_c_stream__()`` returns, and raises TypeError at runtime.
    reader = pa.RecordBatchReader.from_stream(self)
    yield from reader
1132+
11191133
def transform(self, func: Callable[..., DataFrame], *args: Any) -> DataFrame:
11201134
"""Apply a function to the current DataFrame which returns another DataFrame.
11211135

0 commit comments

Comments
 (0)