@@ -2846,6 +2846,7 @@ def test_task_to_record_batches_nanos(format_version: TableVersion, tmpdir: str)
         FileScanTask(data_file),
         bound_row_filter=AlwaysTrue(),
         projected_schema=table_schema,
+        table_schema=table_schema,
         projected_field_ids={1},
         positional_deletes=None,
         case_sensitive=True,
@@ -4590,3 +4591,72 @@ def test_orc_stripe_based_batching(tmp_path: Path) -> None:
     # Verify total rows
     total_rows = sum(batch.num_rows for batch in batches)
     assert total_rows == 10000, f"Expected 10000 total rows, got {total_rows}"
+
+
+def test_partition_column_projection_with_schema_evolution(catalog: InMemoryCatalog) -> None:
+    """Test column projection on a partitioned table after schema evolution (https://github.com/apache/iceberg-python/issues/2672)."""
+    initial_schema = Schema(
+        NestedField(1, "partition_date", DateType(), required=False),
+        NestedField(2, "id", IntegerType(), required=False),
+        NestedField(3, "name", StringType(), required=False),
+        NestedField(4, "value", IntegerType(), required=False),
+    )
+
+    partition_spec = PartitionSpec(
+        PartitionField(source_id=1, field_id=1000, transform=IdentityTransform(), name="partition_date"),
+    )
+
+    catalog.create_namespace("default")
+    table = catalog.create_table(
+        "default.test_schema_evolution_projection",
+        schema=initial_schema,
+        partition_spec=partition_spec,
+    )
+
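+    # Write two rows with the initial four-column schema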
+    data_v1 = pa.Table.from_pylist(
+        [
+            {"partition_date": date(2024, 1, 1), "id": 1, "name": "Alice", "value": 100},
+            {"partition_date": date(2024, 1, 1), "id": 2, "name": "Bob", "value": 200},
+        ],
+        schema=pa.schema(
+            [
+                ("partition_date", pa.date32()),
+                ("id", pa.int32()),
+                ("name", pa.string()),
+                ("value", pa.int32()),
+            ]
+        ),
+    )
+
+    table.append(data_v1)
+
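+    # Evolve the schema by adding an optional string column, then reload the table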
+    with table.update_schema() as update:
+        update.add_column("new_column", StringType())
+
+    table = catalog.load_table("default.test_schema_evolution_projection")
+
+    data_v2 = pa.Table.from_pylist(
+        [
+            {"partition_date": date(2024, 1, 2), "id": 3, "name": "Charlie", "value": 300, "new_column": "new1"},
+            {"partition_date": date(2024, 1, 2), "id": 4, "name": "David", "value": 400, "new_column": "new2"},
+        ],
+        schema=pa.schema(
+            [
+                ("partition_date", pa.date32()),
+                ("id", pa.int32()),
+                ("name", pa.string()),
+                ("value", pa.int32()),
+                ("new_column", pa.string()),
+            ]
+        ),
+    )
+
+    table.append(data_v2)
+
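+    # Project a subset of columns; rows written before the evolution should read new_column as null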
+    result = table.scan(selected_fields=("id", "name", "value", "new_column")).to_arrow()
+
+    assert set(result.schema.names) == {"id", "name", "value", "new_column"}
+    assert result.num_rows == 4
+    result_sorted = result.sort_by("name")
+    assert result_sorted["name"].to_pylist() == ["Alice", "Bob", "Charlie", "David"]
+    assert result_sorted["new_column"].to_pylist() == [None, None, "new1", "new2"]