
Commit c229edc

Revert "UNPICK"
This reverts commit 2c87e18.
1 parent 2c87e18 commit c229edc

36 files changed: +1272 −609 lines changed

README.md

Lines changed: 7 additions & 0 deletions
@@ -187,16 +187,23 @@ See [examples](examples/README.md) for more information.
 
 ## How to install
 
+DataFusion works with any library exposing the Arrow PyCapsule interface. If you
+need `pyarrow`, install the optional extra.
+
 ### uv
 
 ```bash
 uv add datafusion
+# or with PyArrow support
+uv add "datafusion[pyarrow]"
 ```
 
 ### Pip
 
 ```bash
 pip install datafusion
+# or with PyArrow support
+pip install "datafusion[pyarrow]"
 # or
 python -m pip install datafusion
 ```
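
Since the base package no longer depends on `pyarrow`, a quick end-to-end smoke test needs nothing beyond `datafusion` itself. A minimal sketch, assuming only the plain install from the instructions above:

```python
from datafusion import SessionContext

# No pyarrow import anywhere in this snippet; the query runs entirely
# on the core package installed above.
ctx = SessionContext()
ctx.sql("SELECT 1 + 1 AS two").show()
```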

docs/mdbook/src/installation.md

Lines changed: 11 additions & 0 deletions
@@ -18,6 +18,13 @@
 
 DataFusion is easy to install, just like any other Python library.
 
+DataFusion works with any library exposing the Arrow PyCapsule interface. If
+you rely on `pyarrow`, install the optional extra:
+
+```bash
+uv pip install "datafusion[pyarrow]"
+```
+
 ## Using uv
 
 If you do not yet have a virtual environment, create one:
@@ -36,12 +43,16 @@ Or, to add to a project:
 
 ```bash
 uv add datafusion
+# or with PyArrow support
+uv add "datafusion[pyarrow]"
 ```
 
 ## Using pip
 
 ``` bash
 pip install datafusion
+# or with PyArrow support
+pip install "datafusion[pyarrow]"
 ```
 
 ## uv & JupyterLab setup

docs/source/conf.py

Lines changed: 4 additions & 0 deletions
@@ -72,6 +72,10 @@
 suppress_warnings = ["autoapi.python_import_resolution"]
 autoapi_python_class_content = "both"
 autoapi_keep_files = False  # set to True for debugging generated files
+autoapi_options = ["members", "undoc-members", "special-members"]
+autoapi_member_options = {
+    "special-members": "__iter__,__aiter__,__arrow_c_array__,__arrow_c_stream__"
+}
 
 
 def autoapi_skip_member_fn(app, what, name, obj, skip, options) -> bool:  # noqa: ARG001

docs/source/user-guide/common-operations/functions.rst

Lines changed: 4 additions & 0 deletions
@@ -109,6 +109,10 @@ Casting
 
 Casting expressions to different data types using :py:func:`~datafusion.functions.arrow_cast`
 
+DataFusion's :class:`~datafusion.types.DataType` can be constructed from any
+object implementing ``__arrow_c_schema__`` and passed to ``arrow_cast`` without
+requiring :mod:`pyarrow`.
+
 .. ipython:: python
 
     df.select(

docs/source/user-guide/data-sources.rst

Lines changed: 6 additions & 1 deletion
@@ -158,7 +158,12 @@ as Delta Lake. This will require a recent version of
     df = ctx.table("my_delta_table")
     df.show()
 
-On older versions of ``deltalake`` (prior to 0.22) you can use the
+Any Python object that implements the
+``__arrow_c_stream__`` protocol can be registered with
+``register_dataset``. This includes scanners from libraries such as
+``nanoarrow``, ``Polars``, or ``DuckDB``.
+
+On older versions of ``deltalake`` (prior to 0.22) you can use the
 `Arrow DataSet <https://arrow.apache.org/docs/python/generated/pyarrow.dataset.Dataset.html>`_
 interface to import to DataFusion, but this does not support features such as filter push down
 which can lead to a significant performance difference.
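
To make the ``register_dataset`` wording above concrete, here is a sketch of registering a Polars DataFrame, one of the ``__arrow_c_stream__`` sources the docs mention. It assumes the behavior introduced by this commit and a recent ``polars`` release that exposes the Arrow PyCapsule interface:

```python
import polars as pl
from datafusion import SessionContext

ctx = SessionContext()

# Any object exposing __arrow_c_stream__ can be handed to register_dataset;
# a polars DataFrame is one such source.
pl_df = pl.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})
ctx.register_dataset("my_table", pl_df)

ctx.sql("SELECT a, b FROM my_table WHERE a > 1").show()
```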

docs/source/user-guide/dataframe/index.rst

Lines changed: 40 additions & 1 deletion
@@ -145,10 +145,49 @@ To materialize the results of your DataFrame operations:
 
     # Display results
     df.show()  # Print tabular format to console
-
+
     # Count rows
     count = df.count()
 
+PyArrow Streaming
+-----------------
+
+DataFusion DataFrames implement the ``__arrow_c_stream__`` protocol, enabling
+zero-copy streaming into libraries like `PyArrow <https://arrow.apache.org/>`_.
+Earlier versions eagerly converted the entire DataFrame when exporting to
+PyArrow, which could exhaust memory on large datasets. With streaming, batches
+are produced lazily so you can process arbitrarily large results without
+out-of-memory errors.
+
+.. code-block:: python
+
+    import pyarrow as pa
+
+    # Create a PyArrow RecordBatchReader without materializing all batches
+    reader = pa.RecordBatchReader._import_from_c_capsule(df.__arrow_c_stream__())
+    for batch in reader:
+        ...  # process each batch as it is produced
+
+DataFrames expose :py:meth:`~datafusion.DataFrame.to_stream`, which returns a
+``RecordBatchStream`` for lazily processing results without materializing them
+all at once:
+
+.. code-block:: python
+
+    stream = df.to_stream()
+    for batch in stream:
+        ...  # process each batch as it is produced
+
+DataFrames themselves are also iterable and delegate to ``to_stream()`` under
+the hood:
+
+.. code-block:: python
+
+    for batch in df:
+        ...  # process each batch as it is produced
+
+See :doc:`../io/arrow` for additional details on the Arrow interface.
+
 HTML Rendering
 --------------
 
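The hunk above uses the private ``_import_from_c_capsule`` constructor; recent PyArrow releases also expose a public entry point that negotiates ``__arrow_c_stream__`` directly. A hedged sketch, assuming a PyArrow version that provides ``RecordBatchReader.from_stream`` and a DataFusion ``df`` as in the docs above:

```python
import pyarrow as pa

# from_stream pulls batches lazily through the Arrow C stream interface,
# so nothing is materialized up front.
reader = pa.RecordBatchReader.from_stream(df)
for batch in reader:
    ...  # each item is a pyarrow.RecordBatch
```
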
docs/source/user-guide/introduction.rst

Lines changed: 6 additions & 1 deletion
@@ -26,11 +26,16 @@ DataFusion through various examples and highlight the most effective ways of usi
 Installation
 ------------
 
-DataFusion is a Python library and, as such, can be installed via pip from `PyPI <https://pypi.org/project/datafusion>`__.
+DataFusion is a Python library and, as such, can be installed via pip from
+`PyPI <https://pypi.org/project/datafusion>`__. DataFusion works with any
+library exposing the Arrow PyCapsule interface. If you need ``pyarrow``,
+install the optional extra.
 
 .. code-block:: shell
 
     pip install datafusion
+    # or with PyArrow support
+    pip install "datafusion[pyarrow]"
 
 You can verify the installation by running:
 
pyproject.toml

Lines changed: 5 additions & 2 deletions
@@ -38,19 +38,22 @@ classifiers = [
     "Programming Language :: Python :: 3.9",
     "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
-    "Programming Language :: Python :: 3.12",
+    "Programming Language :: Python :: 3.12",
     "Programming Language :: Python :: 3.13",
     "Programming Language :: Python",
     "Programming Language :: Rust",
 ]
-dependencies = ["pyarrow>=11.0.0", "typing-extensions;python_version<'3.13'"]
+dependencies = ["typing-extensions;python_version<'3.13'"]
 dynamic = ["version"]
 
 [project.urls]
 homepage = "https://datafusion.apache.org/python"
 documentation = "https://datafusion.apache.org/python"
 repository = "https://github.com/apache/datafusion-python"
 
+[project.optional-dependencies]
+pyarrow = ["pyarrow>=11.0.0"]
+
 [tool.isort]
 profile = "black"

python/datafusion/catalog.py

Lines changed: 2 additions & 2 deletions
@@ -150,8 +150,8 @@ def __repr__(self) -> str:
         return self.table.__repr__()
 
     @staticmethod
-    def from_dataset(dataset: pa.dataset.Dataset) -> Table:
-        """Turn a pyarrow Dataset into a Table."""
+    def from_dataset(dataset: object) -> Table:
+        """Turn any ``__arrow_c_stream__`` source into a Table."""
         return Table(df_internal.catalog.RawTable.from_dataset(dataset))
 
     @property
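
With the relaxed signature above, ``Table.from_dataset`` accepts any ``__arrow_c_stream__`` source rather than only a ``pyarrow`` ``Dataset``. A sketch of one way to use it, assuming this commit's behavior and a hypothetical Parquet directory; the scanner's reader is what exposes ``__arrow_c_stream__``:

```python
import pyarrow.dataset as ds
from datafusion import SessionContext
from datafusion.catalog import Table

ctx = SessionContext()

# The path is hypothetical; the scanner's reader implements __arrow_c_stream__.
reader = ds.dataset("data/events/", format="parquet").scanner().to_reader()
table = Table.from_dataset(reader)

ctx.register_table("events", table)
ctx.sql("SELECT COUNT(*) FROM events").show()
```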

python/datafusion/context.py

Lines changed: 24 additions & 17 deletions
@@ -22,7 +22,8 @@
 import warnings
 from typing import TYPE_CHECKING, Any, Protocol
 
-import pyarrow as pa
+from datafusion.common import DataTypeMap
+from datafusion.types import ensure_pyarrow_type
 
 try:
     from warnings import deprecated  # Python 3.13+
@@ -45,6 +46,7 @@
 
     import pandas as pd
     import polars as pl
+    import pyarrow as pa
 
     from datafusion.plan import ExecutionPlan, LogicalPlan
 
@@ -550,7 +552,7 @@ def register_listing_table(
         self,
         name: str,
         path: str | pathlib.Path,
-        table_partition_cols: list[tuple[str, str | pa.DataType]] | None = None,
+        table_partition_cols: list[tuple[str, str | DataTypeMap | Any]] | None = None,
         file_extension: str = ".parquet",
         schema: pa.Schema | None = None,
         file_sort_order: list[list[Expr | SortExpr]] | None = None,
@@ -803,7 +805,7 @@ def register_parquet(
         self,
         name: str,
         path: str | pathlib.Path,
-        table_partition_cols: list[tuple[str, str | pa.DataType]] | None = None,
+        table_partition_cols: list[tuple[str, str | DataTypeMap | Any]] | None = None,
         parquet_pruning: bool = True,
         file_extension: str = ".parquet",
         skip_metadata: bool = True,
@@ -895,7 +897,7 @@ def register_json(
         schema: pa.Schema | None = None,
         schema_infer_max_records: int = 1000,
         file_extension: str = ".json",
-        table_partition_cols: list[tuple[str, str | pa.DataType]] | None = None,
+        table_partition_cols: list[tuple[str, str | DataTypeMap | Any]] | None = None,
         file_compression_type: str | None = None,
     ) -> None:
         """Register a JSON file as a table.
@@ -933,7 +935,7 @@ def register_avro(
         path: str | pathlib.Path,
         schema: pa.Schema | None = None,
         file_extension: str = ".avro",
-        table_partition_cols: list[tuple[str, str | pa.DataType]] | None = None,
+        table_partition_cols: list[tuple[str, str | DataTypeMap | Any]] | None = None,
     ) -> None:
         """Register an Avro file as a table.
 
@@ -954,12 +956,16 @@ def register_avro(
             name, str(path), schema, file_extension, table_partition_cols
         )
 
-    def register_dataset(self, name: str, dataset: pa.dataset.Dataset) -> None:
-        """Register a :py:class:`pa.dataset.Dataset` as a table.
+    def register_dataset(self, name: str, dataset: object) -> None:
+        """Register any ``__arrow_c_stream__`` source as a table.
+
+        Any Python object implementing the Arrow ``__arrow_c_stream__`` protocol
+        can be registered, including objects from libraries such as nanoarrow,
+        Polars, DuckDB, or :py:mod:`pyarrow.dataset`.
 
         Args:
             name: Name of the table to register.
-            dataset: PyArrow dataset.
+            dataset: Object exposing ``__arrow_c_stream__``.
         """
         self.ctx.register_dataset(name, dataset)
 
@@ -1009,7 +1015,7 @@ def read_json(
         schema: pa.Schema | None = None,
         schema_infer_max_records: int = 1000,
         file_extension: str = ".json",
-        table_partition_cols: list[tuple[str, str | pa.DataType]] | None = None,
+        table_partition_cols: list[tuple[str, str | DataTypeMap | Any]] | None = None,
         file_compression_type: str | None = None,
     ) -> DataFrame:
         """Read a line-delimited JSON data source.
@@ -1049,7 +1055,7 @@ def read_csv(
         delimiter: str = ",",
         schema_infer_max_records: int = 1000,
         file_extension: str = ".csv",
-        table_partition_cols: list[tuple[str, str | pa.DataType]] | None = None,
+        table_partition_cols: list[tuple[str, str | DataTypeMap | Any]] | None = None,
         file_compression_type: str | None = None,
     ) -> DataFrame:
         """Read a CSV data source.
@@ -1094,7 +1100,7 @@ def read_csv(
     def read_parquet(
         self,
         path: str | pathlib.Path,
-        table_partition_cols: list[tuple[str, str | pa.DataType]] | None = None,
+        table_partition_cols: list[tuple[str, str | DataTypeMap | Any]] | None = None,
         parquet_pruning: bool = True,
         file_extension: str = ".parquet",
         skip_metadata: bool = True,
@@ -1145,7 +1151,7 @@ def read_avro(
         self,
         path: str | pathlib.Path,
         schema: pa.Schema | None = None,
-        file_partition_cols: list[tuple[str, str | pa.DataType]] | None = None,
+        file_partition_cols: list[tuple[str, str | DataTypeMap | Any]] | None = None,
         file_extension: str = ".avro",
     ) -> DataFrame:
         """Create a :py:class:`DataFrame` for reading Avro data source.
@@ -1181,26 +1187,27 @@ def execute(self, plan: ExecutionPlan, partitions: int) -> RecordBatchStream:
 
     @staticmethod
     def _convert_table_partition_cols(
-        table_partition_cols: list[tuple[str, str | pa.DataType]],
-    ) -> list[tuple[str, pa.DataType]]:
+        table_partition_cols: list[tuple[str, str | DataTypeMap | Any]],
+    ) -> list[tuple[str, Any]]:
         warn = False
         converted_table_partition_cols = []
 
         for col, data_type in table_partition_cols:
             if isinstance(data_type, str):
                 warn = True
                 if data_type == "string":
-                    converted_data_type = pa.string()
+                    mapped = DataTypeMap.py_map_from_arrow_type_str("utf8")
                 elif data_type == "int":
-                    converted_data_type = pa.int32()
+                    mapped = DataTypeMap.py_map_from_arrow_type_str("int32")
                 else:
                     message = (
                         f"Unsupported literal data type '{data_type}' for partition "
                         "column. Supported types are 'string' and 'int'"
                     )
                     raise ValueError(message)
+                converted_data_type = ensure_pyarrow_type(mapped)
             else:
-                converted_data_type = data_type
+                converted_data_type = ensure_pyarrow_type(data_type)
 
             converted_table_partition_cols.append((col, converted_data_type))
 
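The last hunk keeps the ``"string"`` and ``"int"`` shorthands for partition columns while routing them through ``DataTypeMap`` and ``ensure_pyarrow_type``. A sketch of the call path it serves, assuming a hypothetical partitioned Parquet directory:

```python
from datafusion import SessionContext

ctx = SessionContext()

# "string" and "int" are the only literal shorthands accepted by
# _convert_table_partition_cols; anything else raises ValueError.
# The path below is hypothetical.
ctx.register_parquet(
    "events",
    "data/events/",
    table_partition_cols=[("year", "int"), ("country", "string")],
)
ctx.sql("SELECT country, COUNT(*) AS n FROM events GROUP BY country").show()
```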