
Commit 41e0fed

support for old logic: accept legacy string literals ("string", "int") for table_partition_cols data types
1 parent a6500a5 commit 41e0fed

File tree: 2 files changed, +54 −10 lines

python/datafusion/context.py

Lines changed: 49 additions & 7 deletions
@@ -19,8 +19,11 @@
 
 from __future__ import annotations
 
+import warnings
 from typing import TYPE_CHECKING, Any, Protocol
 
+import pyarrow as pa
+
 try:
     from warnings import deprecated  # Python 3.13+
 except ImportError:
@@ -535,7 +538,7 @@ def register_listing_table(
         self,
         name: str,
         path: str | pathlib.Path,
-        table_partition_cols: list[tuple[str, pa.DataType]] | None = None,
+        table_partition_cols: list[tuple[str, str | pa.DataType]] | None = None,
         file_extension: str = ".parquet",
         schema: pa.Schema | None = None,
         file_sort_order: list[list[Expr | SortExpr]] | None = None,
@@ -556,6 +559,7 @@ def register_listing_table(
         """
         if table_partition_cols is None:
             table_partition_cols = []
+        table_partition_cols = self._convert_table_partition_cols(table_partition_cols)
         file_sort_order_raw = (
             [sort_list_to_raw_sort_list(f) for f in file_sort_order]
             if file_sort_order is not None
@@ -774,7 +778,7 @@ def register_parquet(
         self,
         name: str,
         path: str | pathlib.Path,
-        table_partition_cols: list[tuple[str, pa.DataType]] | None = None,
+        table_partition_cols: list[tuple[str, str | pa.DataType]] | None = None,
         parquet_pruning: bool = True,
         file_extension: str = ".parquet",
         skip_metadata: bool = True,
@@ -802,6 +806,7 @@ def register_parquet(
         """
         if table_partition_cols is None:
             table_partition_cols = []
+        table_partition_cols = self._convert_table_partition_cols(table_partition_cols)
         self.ctx.register_parquet(
             name,
             str(path),
@@ -865,7 +870,7 @@ def register_json(
         schema: pa.Schema | None = None,
         schema_infer_max_records: int = 1000,
         file_extension: str = ".json",
-        table_partition_cols: list[tuple[str, pa.DataType]] | None = None,
+        table_partition_cols: list[tuple[str, str | pa.DataType]] | None = None,
         file_compression_type: str | None = None,
     ) -> None:
         """Register a JSON file as a table.
@@ -886,6 +891,7 @@ def register_json(
         """
         if table_partition_cols is None:
             table_partition_cols = []
+        table_partition_cols = self._convert_table_partition_cols(table_partition_cols)
         self.ctx.register_json(
             name,
             str(path),
@@ -902,7 +908,7 @@ def register_avro(
         path: str | pathlib.Path,
         schema: pa.Schema | None = None,
         file_extension: str = ".avro",
-        table_partition_cols: list[tuple[str, pa.DataType]] | None = None,
+        table_partition_cols: list[tuple[str, str | pa.DataType]] | None = None,
     ) -> None:
         """Register an Avro file as a table.
 
@@ -918,6 +924,7 @@ def register_avro(
         """
         if table_partition_cols is None:
             table_partition_cols = []
+        table_partition_cols = self._convert_table_partition_cols(table_partition_cols)
         self.ctx.register_avro(
             name, str(path), schema, file_extension, table_partition_cols
         )
@@ -977,7 +984,7 @@ def read_json(
         schema: pa.Schema | None = None,
         schema_infer_max_records: int = 1000,
         file_extension: str = ".json",
-        table_partition_cols: list[tuple[str, pa.DataType]] | None = None,
+        table_partition_cols: list[tuple[str, str | pa.DataType]] | None = None,
         file_compression_type: str | None = None,
     ) -> DataFrame:
         """Read a line-delimited JSON data source.
@@ -997,6 +1004,7 @@ def read_json(
         """
         if table_partition_cols is None:
             table_partition_cols = []
+        table_partition_cols = self._convert_table_partition_cols(table_partition_cols)
         return DataFrame(
             self.ctx.read_json(
                 str(path),
@@ -1016,7 +1024,7 @@ def read_csv(
         delimiter: str = ",",
         schema_infer_max_records: int = 1000,
         file_extension: str = ".csv",
-        table_partition_cols: list[tuple[str, pa.DataType]] | None = None,
+        table_partition_cols: list[tuple[str, str | pa.DataType]] | None = None,
         file_compression_type: str | None = None,
     ) -> DataFrame:
         """Read a CSV data source.
@@ -1041,6 +1049,7 @@ def read_csv(
         """
         if table_partition_cols is None:
             table_partition_cols = []
+        table_partition_cols = self._convert_table_partition_cols(table_partition_cols)
 
         path = [str(p) for p in path] if isinstance(path, list) else str(path)
 
@@ -1060,7 +1069,7 @@ def read_parquet(
     def read_parquet(
         self,
         path: str | pathlib.Path,
-        table_partition_cols: list[tuple[str, pa.DataType]] | None = None,
+        table_partition_cols: list[tuple[str, str | pa.DataType]] | None = None,
         parquet_pruning: bool = True,
         file_extension: str = ".parquet",
         skip_metadata: bool = True,
@@ -1089,6 +1098,7 @@ def read_parquet(
         """
         if table_partition_cols is None:
             table_partition_cols = []
+        table_partition_cols = self._convert_table_partition_cols(table_partition_cols)
         file_sort_order = (
             [sort_list_to_raw_sort_list(f) for f in file_sort_order]
             if file_sort_order is not None
@@ -1142,3 +1152,35 @@ def read_table(self, table: Table) -> DataFrame:
     def execute(self, plan: ExecutionPlan, partitions: int) -> RecordBatchStream:
         """Execute the ``plan`` and return the results."""
         return RecordBatchStream(self.ctx.execute(plan._raw_plan, partitions))
+
+    @staticmethod
+    def _convert_table_partition_cols(
+        table_partition_cols: list[tuple[str, str | pa.DataType]],
+    ) -> list[tuple[str, pa.DataType]]:
+        warn = False
+        converted_table_partition_cols = []
+
+        for col, data_type in table_partition_cols:
+            if isinstance(data_type, str):
+                warn = True
+                if data_type == "string":
+                    converted_data_type = pa.string()
+                elif data_type == "int":
+                    converted_data_type = pa.int32()
+                else:
+                    raise ValueError(
+                        f"Unsupported literal data type '{data_type}' for partition column. Supported types are 'string' and 'int'"
+                    )
+            else:
+                converted_data_type = data_type
+
+            converted_table_partition_cols.append((col, converted_data_type))
+
+        if warn:
+            warnings.warn(
+                "using literals for table_partition_cols data types is deprecated, use pyarrow types instead",
+                category=DeprecationWarning,
+                stacklevel=2,
+            )
+
+        return converted_table_partition_cols
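To make the new behavior concrete, here is a short illustrative snippet (not part of the commit) that calls the new _convert_table_partition_cols helper directly — a private static method, invoked here only for demonstration. Legacy "int"/"string" literals are coerced to pa.int32()/pa.string() and trigger a single DeprecationWarning; pyarrow types pass through unchanged; any other literal raises ValueError.

import warnings

import pyarrow as pa
from datafusion import SessionContext

# Legacy literals are coerced and flagged; pyarrow types pass through as-is.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    converted = SessionContext._convert_table_partition_cols(
        [("year", "int"), ("region", "string"), ("day", pa.date32())]
    )

assert converted == [
    ("year", pa.int32()),
    ("region", pa.string()),
    ("day", pa.date32()),
]
assert any(issubclass(w.category, DeprecationWarning) for w in caught)

# Any literal other than "string" or "int" is rejected outright.
try:
    SessionContext._convert_table_partition_cols([("ts", "timestamp")])
except ValueError as err:
    print(err)  # Unsupported literal data type 'timestamp' for partition column...

In normal use the conversion runs inside the register_*/read_* methods, so with stacklevel=2 the DeprecationWarning is attributed to the frame that called the helper (the wrapping method) rather than to the helper itself.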

python/tests/test_sql.py

Lines changed: 5 additions & 3 deletions
@@ -157,8 +157,8 @@ def test_register_parquet(ctx, tmp_path):
     assert result.to_pydict() == {"cnt": [100]}
 
 
-@pytest.mark.parametrize("path_to_str", [True, False])
-def test_register_parquet_partitioned(ctx, tmp_path, path_to_str):
+@pytest.mark.parametrize("path_to_str,legacy_data_type", [(True, False), (False, False), (False, True)])
+def test_register_parquet_partitioned(ctx, tmp_path, path_to_str, legacy_data_type):
     dir_root = tmp_path / "dataset_parquet_partitioned"
     dir_root.mkdir(exist_ok=False)
     (dir_root / "grp=a").mkdir(exist_ok=False)
@@ -177,10 +177,12 @@ def test_register_parquet_partitioned(ctx, tmp_path, path_to_str):
 
     dir_root = str(dir_root) if path_to_str else dir_root
 
+    partition_data_type = "string" if legacy_data_type else pa.string()
+
     ctx.register_parquet(
         "datapp",
         dir_root,
-        table_partition_cols=[("grp", pa.string())],
+        table_partition_cols=[("grp", partition_data_type)],
         parquet_pruning=True,
         file_extension=".parquet",
     )
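The committed test drives the legacy literal through register_parquet but does not assert that the deprecation actually fires. A possible companion test — a sketch, not part of this commit, assuming the module's existing ctx fixture:

import pyarrow as pa
import pyarrow.parquet as pq
import pytest


def test_register_parquet_legacy_partition_type_warns(ctx, tmp_path):
    # Build a minimal hive-partitioned dataset: one partition, one file.
    dir_root = tmp_path / "dataset_parquet_partitioned"
    (dir_root / "grp=a").mkdir(parents=True)
    pq.write_table(pa.table({"x": [1, 2, 3]}), dir_root / "grp=a" / "part.parquet")

    # The legacy string literal should emit the new DeprecationWarning.
    with pytest.warns(DeprecationWarning, match="table_partition_cols"):
        ctx.register_parquet(
            "datapp_legacy",
            str(dir_root),
            table_partition_cols=[("grp", "string")],
        )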
