Commit aa23651

refactor: enhance type handling in SessionContext and add pyarrow type helpers
1 parent: f084e56

2 files changed: +92 -14 lines

python/datafusion/context.py

Lines changed: 17 additions & 14 deletions
@@ -22,7 +22,8 @@
 import warnings
 from typing import TYPE_CHECKING, Any, Protocol
 
-import pyarrow as pa
+from datafusion.types import ensure_pyarrow_type
+from datafusion.common import DataTypeMap
 
 try:
     from warnings import deprecated  # Python 3.13+
@@ -45,6 +46,7 @@
 
     import pandas as pd
     import polars as pl
+    import pyarrow as pa
 
     from datafusion.plan import ExecutionPlan, LogicalPlan
 
@@ -550,7 +552,7 @@ def register_listing_table(
         self,
         name: str,
         path: str | pathlib.Path,
-        table_partition_cols: list[tuple[str, str | pa.DataType]] | None = None,
+        table_partition_cols: list[tuple[str, str | DataTypeMap | Any]] | None = None,
         file_extension: str = ".parquet",
         schema: pa.Schema | None = None,
         file_sort_order: list[list[Expr | SortExpr]] | None = None,
@@ -803,7 +805,7 @@ def register_parquet(
         self,
         name: str,
         path: str | pathlib.Path,
-        table_partition_cols: list[tuple[str, str | pa.DataType]] | None = None,
+        table_partition_cols: list[tuple[str, str | DataTypeMap | Any]] | None = None,
         parquet_pruning: bool = True,
         file_extension: str = ".parquet",
         skip_metadata: bool = True,
@@ -895,7 +897,7 @@ def register_json(
         schema: pa.Schema | None = None,
         schema_infer_max_records: int = 1000,
         file_extension: str = ".json",
-        table_partition_cols: list[tuple[str, str | pa.DataType]] | None = None,
+        table_partition_cols: list[tuple[str, str | DataTypeMap | Any]] | None = None,
         file_compression_type: str | None = None,
     ) -> None:
         """Register a JSON file as a table.
@@ -933,7 +935,7 @@ def register_avro(
         path: str | pathlib.Path,
         schema: pa.Schema | None = None,
         file_extension: str = ".avro",
-        table_partition_cols: list[tuple[str, str | pa.DataType]] | None = None,
+        table_partition_cols: list[tuple[str, str | DataTypeMap | Any]] | None = None,
     ) -> None:
         """Register an Avro file as a table.
 
@@ -1009,7 +1011,7 @@ def read_json(
         schema: pa.Schema | None = None,
         schema_infer_max_records: int = 1000,
         file_extension: str = ".json",
-        table_partition_cols: list[tuple[str, str | pa.DataType]] | None = None,
+        table_partition_cols: list[tuple[str, str | DataTypeMap | Any]] | None = None,
        file_compression_type: str | None = None,
     ) -> DataFrame:
         """Read a line-delimited JSON data source.
@@ -1049,7 +1051,7 @@ def read_csv(
         delimiter: str = ",",
         schema_infer_max_records: int = 1000,
         file_extension: str = ".csv",
-        table_partition_cols: list[tuple[str, str | pa.DataType]] | None = None,
+        table_partition_cols: list[tuple[str, str | DataTypeMap | Any]] | None = None,
         file_compression_type: str | None = None,
     ) -> DataFrame:
         """Read a CSV data source.
@@ -1094,7 +1096,7 @@ def read_csv(
     def read_parquet(
         self,
         path: str | pathlib.Path,
-        table_partition_cols: list[tuple[str, str | pa.DataType]] | None = None,
+        table_partition_cols: list[tuple[str, str | DataTypeMap | Any]] | None = None,
         parquet_pruning: bool = True,
         file_extension: str = ".parquet",
         skip_metadata: bool = True,
@@ -1145,7 +1147,7 @@ def read_avro(
         self,
         path: str | pathlib.Path,
         schema: pa.Schema | None = None,
-        file_partition_cols: list[tuple[str, str | pa.DataType]] | None = None,
+        file_partition_cols: list[tuple[str, str | DataTypeMap | Any]] | None = None,
         file_extension: str = ".avro",
     ) -> DataFrame:
         """Create a :py:class:`DataFrame` for reading Avro data source.
@@ -1181,26 +1183,27 @@ def execute(self, plan: ExecutionPlan, partitions: int) -> RecordBatchStream:
 
     @staticmethod
     def _convert_table_partition_cols(
-        table_partition_cols: list[tuple[str, str | pa.DataType]],
-    ) -> list[tuple[str, pa.DataType]]:
+        table_partition_cols: list[tuple[str, str | DataTypeMap | Any]],
+    ) -> list[tuple[str, Any]]:
         warn = False
         converted_table_partition_cols = []
 
         for col, data_type in table_partition_cols:
             if isinstance(data_type, str):
                 warn = True
                 if data_type == "string":
-                    converted_data_type = pa.string()
+                    mapped = DataTypeMap.py_map_from_arrow_type_str("utf8")
                 elif data_type == "int":
-                    converted_data_type = pa.int32()
+                    mapped = DataTypeMap.py_map_from_arrow_type_str("int32")
                 else:
                     message = (
                        f"Unsupported literal data type '{data_type}' for partition "
                         "column. Supported types are 'string' and 'int'"
                     )
                     raise ValueError(message)
+                converted_data_type = ensure_pyarrow_type(mapped)
             else:
-                converted_data_type = data_type
+                converted_data_type = ensure_pyarrow_type(data_type)
 
             converted_table_partition_cols.append((col, converted_data_type))
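With this change, table_partition_cols accepts a legacy string literal ("string"/"int"), a DataTypeMap, or a native pyarrow type, and _convert_table_partition_cols normalizes all three through ensure_pyarrow_type. A minimal usage sketch of the widened API (the table name, dataset path, and partition column names are hypothetical):

    import pyarrow as pa

    from datafusion import SessionContext
    from datafusion.common import DataTypeMap

    ctx = SessionContext()
    ctx.register_parquet(
        "events",                    # hypothetical table name
        "data/events/",              # hypothetical partitioned dataset
        table_partition_cols=[
            ("year", pa.int32()),    # pyarrow type: passes through unchanged
            ("region", DataTypeMap.py_map_from_arrow_type_str("utf8")),
            ("day", "int"),          # legacy literal, still converted (flagged for deprecation)
        ],
    )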

python/datafusion/types.py

Lines changed: 75 additions & 0 deletions
@@ -0,0 +1,75 @@
+"""Internal Arrow type helpers with optional PyArrow conversion."""
+
+from __future__ import annotations
+
+from typing import Any
+
+try:  # pragma: no cover - optional dependency
+    import pyarrow as pa
+except Exception:  # pragma: no cover - optional dependency
+    pa = None  # type: ignore
+
+from datafusion.common import DataTypeMap
+
+_PYARROW_TYPE_FACTORIES = {
+    "Null": lambda: pa.null() if pa else None,
+    "Boolean": lambda: pa.bool_() if pa else None,
+    "Int8": lambda: pa.int8() if pa else None,
+    "Int16": lambda: pa.int16() if pa else None,
+    "Int32": lambda: pa.int32() if pa else None,
+    "Int64": lambda: pa.int64() if pa else None,
+    "UInt8": lambda: pa.uint8() if pa else None,
+    "UInt16": lambda: pa.uint16() if pa else None,
+    "UInt32": lambda: pa.uint32() if pa else None,
+    "UInt64": lambda: pa.uint64() if pa else None,
+    "Float16": lambda: pa.float16() if pa else None,
+    "Float32": lambda: pa.float32() if pa else None,
+    "Float64": lambda: pa.float64() if pa else None,
+    "Utf8": lambda: pa.string() if pa else None,
+}
+
+
+def pyarrow_available() -> bool:
+    """Return ``True`` if :mod:`pyarrow` can be imported."""
+
+    return pa is not None
+
+
+def to_pyarrow(data_type: DataTypeMap) -> "pa.DataType":
+    """Convert a :class:`DataTypeMap` to a :mod:`pyarrow` data type.
+
+    Raises ``ModuleNotFoundError`` if :mod:`pyarrow` is not installed.
+    """
+
+    if pa is None:  # pragma: no cover - optional dependency
+        raise ModuleNotFoundError("pyarrow is not installed")
+    name = str(data_type.arrow_type)
+    factory = _PYARROW_TYPE_FACTORIES.get(name)
+    if factory is None:
+        msg = f"Conversion to pyarrow for '{name}' is not implemented"
+        raise NotImplementedError(msg)
+    return factory()
+
+
+def from_pyarrow(pa_type: "pa.DataType") -> DataTypeMap:
+    """Convert a :mod:`pyarrow` data type to :class:`DataTypeMap`.
+
+    Raises ``ModuleNotFoundError`` if :mod:`pyarrow` is not installed.
+    """
+
+    if pa is None:  # pragma: no cover - optional dependency
+        raise ModuleNotFoundError("pyarrow is not installed")
+    return DataTypeMap.py_map_from_arrow_type_str(str(pa_type))
+
+
+def ensure_pyarrow_type(value: DataTypeMap | Any) -> Any:
+    """Ensure ``value`` is a :mod:`pyarrow` data type if available.
+
+    If ``value`` is a :class:`DataTypeMap` and :mod:`pyarrow` is installed,
+    it will be converted to the corresponding :mod:`pyarrow` data type.
+    Otherwise ``value`` is returned unchanged.
+    """
+
+    if isinstance(value, DataTypeMap):
+        return to_pyarrow(value) if pyarrow_available() else value
+    return value
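The helpers degrade gracefully when pyarrow is absent: ensure_pyarrow_type returns a DataTypeMap unchanged instead of raising. A short round-trip sketch, assuming pyarrow is installed, that py_map_from_arrow_type_str accepts lowercase names like "int64" (the diff uses "utf8" and "int32" the same way), and that DataTypeMap.arrow_type stringifies to the names keyed in _PYARROW_TYPE_FACTORIES (e.g. "Int64"):

    import pyarrow as pa

    from datafusion.common import DataTypeMap
    from datafusion.types import ensure_pyarrow_type, from_pyarrow, to_pyarrow

    # pyarrow -> DataTypeMap -> pyarrow round trip
    mapped = from_pyarrow(pa.int64())        # DataTypeMap via str(pa.int64()) == "int64"
    assert to_pyarrow(mapped) == pa.int64()  # back through the factory table

    # Anything that is not a DataTypeMap passes through untouched
    assert ensure_pyarrow_type(pa.string()) == pa.string()
    assert ensure_pyarrow_type(mapped) == pa.int64()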
