@@ -2846,6 +2846,7 @@ def test_task_to_record_batches_nanos(format_version: TableVersion, tmpdir: str)
28462846 FileScanTask (data_file ),
28472847 bound_row_filter = AlwaysTrue (),
28482848 projected_schema = table_schema ,
2849+ table_schema = table_schema ,
28492850 projected_field_ids = {1 },
28502851 positional_deletes = None ,
28512852 case_sensitive = True ,
@@ -4590,3 +4591,73 @@ def test_orc_stripe_based_batching(tmp_path: Path) -> None:
45904591 # Verify total rows
45914592 total_rows = sum (batch .num_rows for batch in batches )
45924593 assert total_rows == 10000 , f"Expected 10000 total rows, got { total_rows } "
4594+
4595+
def test_partition_column_projection_with_schema_evolution(catalog: InMemoryCatalog) -> None:
    """Test column projection on partitioned table after schema evolution (https://github.com/apache/iceberg-python/issues/2672)."""
    base_schema = Schema(
        NestedField(1, "partition_date", DateType(), required=False),
        NestedField(2, "id", IntegerType(), required=False),
        NestedField(3, "name", StringType(), required=False),
        NestedField(4, "value", IntegerType(), required=False),
    )

    # Identity-partition on the date column so each append below lands in its own partition.
    spec = PartitionSpec(
        PartitionField(source_id=1, field_id=1000, transform=IdentityTransform(), name="partition_date"),
    )

    catalog.create_namespace("default")
    table = catalog.create_table(
        "default.test_schema_evolution_projection",
        schema=base_schema,
        partition_spec=spec,
    )

    # First append: data written against the original four-column schema.
    first_batch = pa.Table.from_pylist(
        [
            {"partition_date": date(2024, 1, 1), "id": 1, "name": "Alice", "value": 100},
            {"partition_date": date(2024, 1, 1), "id": 2, "name": "Bob", "value": 200},
        ],
        schema=pa.schema(
            [
                ("partition_date", pa.date32()),
                ("id", pa.int32()),
                ("name", pa.string()),
                ("value", pa.int32()),
            ]
        ),
    )
    table.append(first_batch)

    # Evolve the schema by adding a column, then reload so the handle sees the new schema.
    with table.update_schema() as update:
        update.add_column("new_column", StringType())
    table = catalog.load_table("default.test_schema_evolution_projection")

    # Second append: data written against the evolved five-column schema.
    second_batch = pa.Table.from_pylist(
        [
            {"partition_date": date(2024, 1, 2), "id": 3, "name": "Charlie", "value": 300, "new_column": "new1"},
            {"partition_date": date(2024, 1, 2), "id": 4, "name": "David", "value": 400, "new_column": "new2"},
        ],
        schema=pa.schema(
            [
                ("partition_date", pa.date32()),
                ("id", pa.int32()),
                ("name", pa.string()),
                ("value", pa.int32()),
                ("new_column", pa.string()),
            ]
        ),
    )
    table.append(second_batch)

    # Project a subset of columns (including the added one) across files written
    # under both schema versions; rows from before the evolution must surface
    # the new column as null.
    result = table.scan(selected_fields=("id", "name", "value", "new_column")).to_arrow()

    assert set(result.schema.names) == {"id", "name", "value", "new_column"}
    assert result.num_rows == 4
    by_name = result.sort_by("name")
    assert by_name["name"].to_pylist() == ["Alice", "Bob", "Charlie", "David"]
    assert by_name["new_column"].to_pylist() == [None, None, "new1", "new2"]
0 commit comments