145145 from pyiceberg_core .datafusion import IcebergDataFusionTable
146146
147147 from pyiceberg .catalog import Catalog
148+ from pyiceberg .catalog .rest .scan_planning import (
149+ RESTContentFile ,
150+ RESTDeleteFile ,
151+ RESTFileScanTask ,
152+ )
148153
# Module-level AlwaysTrue() instance. In this file it serves as the fallback residual for
# REST-planned scan tasks and as the value row_filter is compared against to detect "no filter".
149154ALWAYS_TRUE = AlwaysTrue ()
# String key "downcast-ns-timestamp-to-us-on-write".
# NOTE(review): presumably a configuration/property name controlling nanosecond-to-microsecond
# timestamp downcasting on write; its consumers are not visible in this chunk, so confirm.
150155DOWNCAST_NS_TIMESTAMP_TO_US_ON_WRITE = "downcast-ns-timestamp-to-us-on-write"
@@ -1168,6 +1173,8 @@ def scan(
11681173 snapshot_id = snapshot_id ,
11691174 options = options ,
11701175 limit = limit ,
1176+ catalog = self .catalog ,
1177+ table_identifier = self ._identifier ,
11711178 )
11721179
11731180 @property
@@ -1684,6 +1691,8 @@ class TableScan(ABC):
16841691 snapshot_id : int | None
16851692 options : Properties
16861693 limit : int | None
1694+ catalog : Catalog | None
1695+ table_identifier : Identifier | None
16871696
16881697 def __init__ (
16891698 self ,
@@ -1695,6 +1704,8 @@ def __init__(
16951704 snapshot_id : int | None = None ,
16961705 options : Properties = EMPTY_DICT ,
16971706 limit : int | None = None ,
1707+ catalog : Catalog | None = None ,
1708+ table_identifier : Identifier | None = None ,
16981709 ):
16991710 self .table_metadata = table_metadata
17001711 self .io = io
@@ -1704,6 +1715,8 @@ def __init__(
17041715 self .snapshot_id = snapshot_id
17051716 self .options = options
17061717 self .limit = limit
1718+ self .catalog = catalog
1719+ self .table_identifier = table_identifier
17071720
17081721 def snapshot (self ) -> Snapshot | None :
17091722 if self .snapshot_id :
@@ -1798,6 +1811,67 @@ def __init__(
17981811 self .delete_files = delete_files or set ()
17991812 self .residual = residual
18001813
@staticmethod
def from_rest_response(
    rest_task: RESTFileScanTask,
    delete_files: list[RESTDeleteFile],
) -> FileScanTask:
    """Convert a RESTFileScanTask to a FileScanTask.

    Args:
        rest_task: The REST file scan task.
        delete_files: The list of delete files from the ScanTasks response.
            ``rest_task.delete_file_references`` holds positions in this list.

    Returns:
        A FileScanTask with the converted data and delete files. Its residual
        is the task's residual filter, or ALWAYS_TRUE when the task has none.

    Raises:
        NotImplementedError: If equality delete files are encountered.
        IndexError: If a delete file reference is negative or beyond the end
            of ``delete_files``.
    """
    from pyiceberg.catalog.rest.scan_planning import RESTEqualityDeleteFile

    # Column statistics are kept for the data file only; delete files are
    # converted without them below.
    data_file = _rest_file_to_data_file(rest_task.data_file, include_stats=True)

    resolved_deletes: set[DataFile] = set()
    # `or ()` preserves the original behavior for a missing or empty reference list.
    for idx in rest_task.delete_file_references or ():
        # A negative reference would otherwise be accepted by Python's
        # from-the-end indexing and silently attach an unrelated delete file.
        if not 0 <= idx < len(delete_files):
            raise IndexError(
                f"Delete file reference {idx} is out of range for {len(delete_files)} delete file(s)"
            )
        delete_file = delete_files[idx]
        if isinstance(delete_file, RESTEqualityDeleteFile):
            raise NotImplementedError(f"PyIceberg does not yet support equality deletes: {delete_file.file_path}")
        resolved_deletes.add(_rest_file_to_data_file(delete_file, include_stats=False))

    # Test for None explicitly so a present expression is never replaced
    # because of how it happens to evaluate in a boolean context.
    residual = rest_task.residual_filter
    return FileScanTask(
        data_file=data_file,
        delete_files=resolved_deletes,
        residual=ALWAYS_TRUE if residual is None else residual,
    )
1848+
1849+
def _rest_file_to_data_file(rest_file: RESTContentFile, *, include_stats: bool) -> DataFile:
    """Build a manifest DataFile from a REST content file.

    The per-column metrics (column sizes, value counts, null value counts and
    NaN value counts) are copied via ``to_dict()`` only when ``include_stats``
    is true and the attribute exists on ``rest_file`` with a truthy value;
    otherwise the corresponding DataFile argument is None.
    """
    from pyiceberg.catalog.rest.scan_planning import CONTENT_TYPE_MAP

    # Read the optional metric attributes up front; a missing attribute is
    # treated the same as an absent metric.
    metrics = {
        name: getattr(rest_file, name, None)
        for name in ("column_sizes", "value_counts", "null_value_counts", "nan_value_counts")
    }

    def _metric_dict(name):
        """Return the named metric as a dict, or None when stats are excluded or absent."""
        metric = metrics[name]
        if include_stats and metric:
            return metric.to_dict()
        return None

    return DataFile.from_args(
        content=CONTENT_TYPE_MAP[rest_file.content],
        file_path=rest_file.file_path,
        file_format=rest_file.file_format,
        partition=Record(*rest_file.partition),
        record_count=rest_file.record_count,
        file_size_in_bytes=rest_file.file_size_in_bytes,
        column_sizes=_metric_dict("column_sizes"),
        value_counts=_metric_dict("value_counts"),
        null_value_counts=_metric_dict("null_value_counts"),
        nan_value_counts=_metric_dict("nan_value_counts"),
        split_offsets=rest_file.split_offsets,
        sort_order_id=rest_file.sort_order_id,
        spec_id=rest_file.spec_id,
    )
1874+
18011875
18021876def _open_manifest (
18031877 io : FileIO ,
@@ -1970,12 +2044,27 @@ def scan_plan_helper(self) -> Iterator[list[ManifestEntry]]:
19702044 ],
19712045 )
19722046
1973- def plan_files (self ) -> Iterable [FileScanTask ]:
1974- """Plans the relevant files by filtering on the PartitionSpecs.
2047+ def _should_use_rest_planning (self ) -> bool :
2048+ """Check if REST scan planning should be used for this scan."""
2049+ if self .catalog is None :
2050+ return False
2051+ return self .catalog .is_rest_scan_planning_enabled ()
2052+
def _plan_files_rest(self) -> Iterable[FileScanTask]:
    """Plan files using REST server-side scan planning.

    Builds a PlanTableScanRequest from this scan's snapshot id, selected
    fields, row filter and case sensitivity, and passes it to the catalog's
    ``plan_scan`` together with the table identifier.

    Returns:
        The result of ``catalog.plan_scan`` for this table and request.

    Raises:
        ValueError: If the scan has no catalog or no table identifier.
    """
    from pyiceberg.catalog.rest.scan_planning import PlanTableScanRequest

    # Fail with a clear message instead of an AttributeError on a None catalog
    # or a request sent for a None identifier.
    if self.catalog is None or self.table_identifier is None:
        raise ValueError("REST scan planning requires both a catalog and a table identifier")

    request = PlanTableScanRequest(
        snapshot_id=self.snapshot_id,
        # A "*" entry means "all columns" whether selected_fields arrives as a
        # tuple or a list; comparing against the exact tuple ("*",) would send
        # "*" to the server as a literal column name for any other spelling.
        select=None if "*" in self.selected_fields else list(self.selected_fields),
        # An always-true filter carries no information, so it is omitted.
        filter=self.row_filter if self.row_filter != ALWAYS_TRUE else None,
        case_sensitive=self.case_sensitive,
    )

    return self.catalog.plan_scan(self.table_identifier, request)
2065+
2066+ def _plan_files_local (self ) -> Iterable [FileScanTask ]:
2067+ """Plan files locally by reading manifests."""
19792068 data_entries : list [ManifestEntry ] = []
19802069 positional_delete_entries = SortedList (key = lambda entry : entry .sequence_number or INITIAL_SEQUENCE_NUMBER )
19812070
@@ -2006,6 +2095,20 @@ def plan_files(self) -> Iterable[FileScanTask]:
20062095 for data_entry in data_entries
20072096 ]
20082097
def plan_files(self) -> Iterable[FileScanTask]:
    """Plan the files relevant to this scan.

    Server-side (REST) scan planning is used when
    ``_should_use_rest_planning()`` says so; in every other case the files
    are planned locally by reading manifests.

    Returns:
        List of FileScanTasks that contain both data and delete files.
    """
    # Pick the planner first, then invoke it; only the chosen one runs.
    planner = self._plan_files_rest if self._should_use_rest_planning() else self._plan_files_local
    return planner()
2111+
20092112 def to_arrow (self ) -> pa .Table :
20102113 """Read an Arrow table eagerly from this DataScan.
20112114
0 commit comments