Skip to content

Commit 76084a8

Browse files
author
Somasundaram Sekar
committed
fix: Allow writing data with missing optional map fields
When writing data to a table with an optional map field, the schema compatibility check incorrectly failed if the data was missing that field. This happened because the validator descended into the map's internal key field (which is always required) even when the parent map field was optional and missing. The fix modifies _SchemaCompatibilityVisitor.field() to skip child validation when an optional parent field is missing from the provided schema. Closes #2684
1 parent abae20f commit 76084a8

File tree

2 files changed

+117
-1
lines changed

2 files changed

+117
-1
lines changed

pyiceberg/schema.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1783,7 +1783,18 @@ def struct(self, struct: StructType, field_results: builtins.list[Callable[[], b
17831783
return all(results)
17841784

17851785
def field(self, field: NestedField, field_result: Callable[[], bool]) -> bool:
1786-
return self._is_field_compatible(field) and field_result()
1786+
is_compatible = self._is_field_compatible(field)
1787+
# Check if field exists in provided schema
1788+
try:
1789+
self.provided_schema.find_field(field.field_id)
1790+
field_exists = True
1791+
except ValueError:
1792+
field_exists = False
1793+
# If field doesn't exist (and is optional), skip child validation
1794+
# Children's required fields shouldn't be validated for a missing optional parent
1795+
if not field_exists:
1796+
return is_compatible
1797+
return is_compatible and field_result()
17871798

17881799
def list(self, list_type: ListType, element_result: Callable[[], bool]) -> bool:
17891800
return self._is_field_compatible(list_type.element_field) and element_result()

tests/test_schema.py

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
from pyiceberg.schema import (
2626
Accessor,
2727
Schema,
28+
_check_schema_compatible,
2829
build_position_accessors,
2930
index_by_id,
3031
index_by_name,
@@ -1687,3 +1688,107 @@ def test_arrow_schema() -> None:
16871688
)
16881689

16891690
assert base_schema.as_arrow() == expected_schema
1691+
1692+
1693+
def test_check_schema_compatible_optional_map_field_missing() -> None:
1694+
"""Test that optional map field missing from provided schema is compatible (issue #2684)."""
1695+
requested_schema = Schema(
1696+
NestedField(field_id=1, name="id", field_type=LongType(), required=True),
1697+
NestedField(
1698+
field_id=2,
1699+
name="data",
1700+
field_type=MapType(key_id=3, key_type=StringType(), value_id=4, value_type=StringType()),
1701+
required=False, # Optional map field
1702+
),
1703+
)
1704+
# Provided schema is missing the optional map field
1705+
provided_schema = Schema(
1706+
NestedField(field_id=1, name="id", field_type=LongType(), required=True),
1707+
)
1708+
# Should not raise - optional field can be missing
1709+
_check_schema_compatible(requested_schema, provided_schema)
1710+
1711+
1712+
def test_check_schema_compatible_required_map_field_missing() -> None:
1713+
"""Test that required map field missing from provided schema raises error."""
1714+
requested_schema = Schema(
1715+
NestedField(field_id=1, name="id", field_type=LongType(), required=True),
1716+
NestedField(
1717+
field_id=2,
1718+
name="data",
1719+
field_type=MapType(key_id=3, key_type=StringType(), value_id=4, value_type=StringType()),
1720+
required=True, # Required map field
1721+
),
1722+
)
1723+
# Provided schema is missing the required map field
1724+
provided_schema = Schema(
1725+
NestedField(field_id=1, name="id", field_type=LongType(), required=True),
1726+
)
1727+
# Should raise - required field cannot be missing
1728+
with pytest.raises(ValueError, match="Mismatch in fields"):
1729+
_check_schema_compatible(requested_schema, provided_schema)
1730+
1731+
1732+
def test_check_schema_compatible_optional_list_field_missing() -> None:
1733+
"""Test that optional list field missing from provided schema is compatible."""
1734+
requested_schema = Schema(
1735+
NestedField(field_id=1, name="id", field_type=LongType(), required=True),
1736+
NestedField(
1737+
field_id=2,
1738+
name="items",
1739+
field_type=ListType(element_id=3, element_type=StringType(), element_required=True),
1740+
required=False, # Optional list field
1741+
),
1742+
)
1743+
# Provided schema is missing the optional list field
1744+
provided_schema = Schema(
1745+
NestedField(field_id=1, name="id", field_type=LongType(), required=True),
1746+
)
1747+
# Should not raise - optional field can be missing
1748+
_check_schema_compatible(requested_schema, provided_schema)
1749+
1750+
1751+
def test_check_schema_compatible_optional_struct_field_missing() -> None:
1752+
"""Test that optional struct field missing from provided schema is compatible."""
1753+
requested_schema = Schema(
1754+
NestedField(field_id=1, name="id", field_type=LongType(), required=True),
1755+
NestedField(
1756+
field_id=2,
1757+
name="details",
1758+
field_type=StructType(
1759+
NestedField(field_id=3, name="name", field_type=StringType(), required=True),
1760+
NestedField(field_id=4, name="count", field_type=IntegerType(), required=True),
1761+
),
1762+
required=False, # Optional struct field
1763+
),
1764+
)
1765+
# Provided schema is missing the optional struct field
1766+
provided_schema = Schema(
1767+
NestedField(field_id=1, name="id", field_type=LongType(), required=True),
1768+
)
1769+
# Should not raise - optional field can be missing
1770+
_check_schema_compatible(requested_schema, provided_schema)
1771+
1772+
1773+
def test_check_schema_compatible_optional_map_field_present() -> None:
1774+
"""Test that optional map field present in provided schema is compatible."""
1775+
requested_schema = Schema(
1776+
NestedField(field_id=1, name="id", field_type=LongType(), required=True),
1777+
NestedField(
1778+
field_id=2,
1779+
name="data",
1780+
field_type=MapType(key_id=3, key_type=StringType(), value_id=4, value_type=StringType()),
1781+
required=False,
1782+
),
1783+
)
1784+
provided_schema = Schema(
1785+
NestedField(field_id=1, name="id", field_type=LongType(), required=True),
1786+
NestedField(
1787+
field_id=2,
1788+
name="data",
1789+
field_type=MapType(key_id=3, key_type=StringType(), value_id=4, value_type=StringType()),
1790+
required=False,
1791+
),
1792+
)
1793+
# Should not raise - schemas match
1794+
_check_schema_compatible(requested_schema, provided_schema)

0 commit comments

Comments
 (0)