Skip to content

Commit 9b36337

Browse files
committed
feat: Add Iceberg v3 geospatial primitive types (geometry and geography)
Implement support for Iceberg v3 geospatial types as specified in the Iceberg specification: - Add GeometryType(crs) and GeographyType(crs, algorithm) to types.py - Default CRS is "OGC:CRS84", default algorithm is "spherical" - Types require format version 3 (minimum_format_version() returns 3) - Values are stored as WKB (Well-Known Binary) bytes at runtime - Avro schema conversion maps to "bytes" - PyArrow conversion maps to large_binary() - Add type string parsing for geometry('CRS') and geography('CRS', 'algo') - Add visitor pattern support in schema.py and resolver.py Note: JSON single-value encoding (WKB<->WKT) raises NotImplementedError as it requires external libraries (e.g., Shapely) which are not included to avoid heavy dependencies.
1 parent 75ef45d commit 9b36337

File tree

7 files changed

+591
-0
lines changed

7 files changed

+591
-0
lines changed

pyiceberg/avro/resolver.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,8 @@
8787
DoubleType,
8888
FixedType,
8989
FloatType,
90+
GeographyType,
91+
GeometryType,
9092
IcebergType,
9193
IntegerType,
9294
ListType,
@@ -204,6 +206,14 @@ def visit_binary(self, binary_type: BinaryType) -> Writer:
204206
def visit_unknown(self, unknown_type: UnknownType) -> Writer:
205207
return UnknownWriter()
206208

209+
def visit_geometry(self, geometry_type: "GeometryType") -> Writer:
210+
"""Geometry is written as WKB bytes in Avro."""
211+
return BinaryWriter()
212+
213+
def visit_geography(self, geography_type: "GeographyType") -> Writer:
214+
"""Geography is written as WKB bytes in Avro."""
215+
return BinaryWriter()
216+
207217

208218
CONSTRUCT_WRITER_VISITOR = ConstructWriter()
209219

@@ -359,6 +369,14 @@ def visit_binary(self, binary_type: BinaryType, partner: IcebergType | None) ->
359369
def visit_unknown(self, unknown_type: UnknownType, partner: IcebergType | None) -> Writer:
360370
return UnknownWriter()
361371

372+
def visit_geometry(self, geometry_type: "GeometryType", partner: IcebergType | None) -> Writer:
373+
"""Geometry is written as WKB bytes in Avro."""
374+
return BinaryWriter()
375+
376+
def visit_geography(self, geography_type: "GeographyType", partner: IcebergType | None) -> Writer:
377+
"""Geography is written as WKB bytes in Avro."""
378+
return BinaryWriter()
379+
362380

363381
class ReadSchemaResolver(PrimitiveWithPartnerVisitor[IcebergType, Reader]):
364382
__slots__ = ("read_types", "read_enums", "context")
@@ -498,6 +516,14 @@ def visit_binary(self, binary_type: BinaryType, partner: IcebergType | None) ->
498516
def visit_unknown(self, unknown_type: UnknownType, partner: IcebergType | None) -> Reader:
499517
return UnknownReader()
500518

519+
def visit_geometry(self, geometry_type: "GeometryType", partner: IcebergType | None) -> Reader:
520+
"""Geometry is read as WKB bytes from Avro."""
521+
return BinaryReader()
522+
523+
def visit_geography(self, geography_type: "GeographyType", partner: IcebergType | None) -> Reader:
524+
"""Geography is read as WKB bytes from Avro."""
525+
return BinaryReader()
526+
501527

502528
class SchemaPartnerAccessor(PartnerAccessor[IcebergType]):
503529
def schema_partner(self, partner: IcebergType | None) -> IcebergType | None:

pyiceberg/conversions.py

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,8 @@
4949
DoubleType,
5050
FixedType,
5151
FloatType,
52+
GeographyType,
53+
GeometryType,
5254
IntegerType,
5355
LongType,
5456
PrimitiveType,
@@ -182,6 +184,18 @@ def _(type_: UnknownType, _: str) -> None:
182184
return None
183185

184186

187+
@partition_to_py.register(GeometryType)
188+
@partition_to_py.register(GeographyType)
189+
@handle_none
190+
def _(_: PrimitiveType, value_str: str) -> bytes:
191+
"""Convert a geometry/geography partition string to bytes.
192+
193+
Note: Partition values for geometry/geography types are expected to be
194+
hex-encoded WKB (Well-Known Binary) strings.
195+
"""
196+
return bytes.fromhex(value_str)
197+
198+
185199
@singledispatch
186200
def to_bytes(
187201
primitive_type: PrimitiveType, _: bool | bytes | Decimal | date | datetime | float | int | str | time | uuid.UUID
@@ -273,6 +287,8 @@ def _(_: UUIDType, value: uuid.UUID | bytes) -> bytes:
273287

274288
@to_bytes.register(BinaryType)
275289
@to_bytes.register(FixedType)
290+
@to_bytes.register(GeometryType)
291+
@to_bytes.register(GeographyType)
276292
def _(_: PrimitiveType, value: bytes) -> bytes:
277293
return value
278294

@@ -354,6 +370,8 @@ def _(_: StringType, b: bytes) -> str:
354370
@from_bytes.register(BinaryType)
355371
@from_bytes.register(FixedType)
356372
@from_bytes.register(UUIDType)
373+
@from_bytes.register(GeometryType)
374+
@from_bytes.register(GeographyType)
357375
def _(_: PrimitiveType, b: bytes) -> bytes:
358376
return b
359377

@@ -474,6 +492,40 @@ def _(_: UUIDType, val: uuid.UUID) -> str:
474492
return str(val)
475493

476494

495+
@to_json.register(GeometryType)
496+
def _(_: GeometryType, val: bytes) -> str:
497+
"""Serialize geometry to WKT string per Iceberg spec.
498+
499+
Note: This requires WKB to WKT conversion which is not yet implemented.
500+
The Iceberg spec requires geometry values to be serialized as WKT strings
501+
in JSON, but PyIceberg stores geometry as WKB bytes at runtime.
502+
503+
Raises:
504+
NotImplementedError: WKB to WKT conversion is not yet supported.
505+
"""
506+
raise NotImplementedError(
507+
"Geometry JSON serialization requires WKB to WKT conversion, which is not yet implemented. "
508+
"See https://iceberg.apache.org/spec/#json-single-value-serialization for spec details."
509+
)
510+
511+
512+
@to_json.register(GeographyType)
513+
def _(_: GeographyType, val: bytes) -> str:
514+
"""Serialize geography to WKT string per Iceberg spec.
515+
516+
Note: This requires WKB to WKT conversion which is not yet implemented.
517+
The Iceberg spec requires geography values to be serialized as WKT strings
518+
in JSON, but PyIceberg stores geography as WKB bytes at runtime.
519+
520+
Raises:
521+
NotImplementedError: WKB to WKT conversion is not yet supported.
522+
"""
523+
raise NotImplementedError(
524+
"Geography JSON serialization requires WKB to WKT conversion, which is not yet implemented. "
525+
"See https://iceberg.apache.org/spec/#json-single-value-serialization for spec details."
526+
)
527+
528+
477529
@singledispatch # type: ignore
478530
def from_json(primitive_type: PrimitiveType, val: Any) -> L: # type: ignore
479531
"""Convert JSON value types into built-in python values.
@@ -593,3 +645,43 @@ def _(_: UUIDType, val: str | bytes | uuid.UUID) -> uuid.UUID:
593645
return uuid.UUID(bytes=val)
594646
else:
595647
return val
648+
649+
650+
@from_json.register(GeometryType)
651+
def _(_: GeometryType, val: str | bytes) -> bytes:
652+
"""Convert JSON WKT string into WKB bytes per Iceberg spec.
653+
654+
Note: This requires WKT to WKB conversion which is not yet implemented.
655+
The Iceberg spec requires geometry values to be represented as WKT strings
656+
in JSON, but PyIceberg stores geometry as WKB bytes at runtime.
657+
658+
Raises:
659+
NotImplementedError: WKT to WKB conversion is not yet supported.
660+
"""
661+
if isinstance(val, bytes):
662+
# Already WKB bytes, return as-is
663+
return val
664+
raise NotImplementedError(
665+
"Geometry JSON deserialization requires WKT to WKB conversion, which is not yet implemented. "
666+
"See https://iceberg.apache.org/spec/#json-single-value-serialization for spec details."
667+
)
668+
669+
670+
@from_json.register(GeographyType)
671+
def _(_: GeographyType, val: str | bytes) -> bytes:
672+
"""Convert JSON WKT string into WKB bytes per Iceberg spec.
673+
674+
Note: This requires WKT to WKB conversion which is not yet implemented.
675+
The Iceberg spec requires geography values to be represented as WKT strings
676+
in JSON, but PyIceberg stores geography as WKB bytes at runtime.
677+
678+
Raises:
679+
NotImplementedError: WKT to WKB conversion is not yet supported.
680+
"""
681+
if isinstance(val, bytes):
682+
# Already WKB bytes, return as-is
683+
return val
684+
raise NotImplementedError(
685+
"Geography JSON deserialization requires WKT to WKB conversion, which is not yet implemented. "
686+
"See https://iceberg.apache.org/spec/#json-single-value-serialization for spec details."
687+
)

pyiceberg/io/pyarrow.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,8 @@
157157
DoubleType,
158158
FixedType,
159159
FloatType,
160+
GeographyType,
161+
GeometryType,
160162
IcebergType,
161163
IntegerType,
162164
ListType,
@@ -798,6 +800,26 @@ def visit_unknown(self, _: UnknownType) -> pa.DataType:
798800
def visit_binary(self, _: BinaryType) -> pa.DataType:
799801
return pa.large_binary()
800802

803+
def visit_geometry(self, geometry_type: GeometryType) -> pa.DataType:
804+
"""Convert geometry type to PyArrow binary.
805+
806+
Note: PyArrow 21.0.0+ supports native GEOMETRY logical type from Arrow GEO.
807+
For now, we use large_binary which stores WKB bytes.
808+
Future enhancement: detect PyArrow version and use pa.geometry() when available.
809+
"""
810+
# TODO: When PyArrow 21.0.0+ is available, use pa.geometry() with CRS metadata
811+
return pa.large_binary()
812+
813+
def visit_geography(self, geography_type: GeographyType) -> pa.DataType:
814+
"""Convert geography type to PyArrow binary.
815+
816+
Note: PyArrow 21.0.0+ supports native GEOGRAPHY logical type from Arrow GEO.
817+
For now, we use large_binary which stores WKB bytes.
818+
Future enhancement: detect PyArrow version and use pa.geography() when available.
819+
"""
820+
# TODO: When PyArrow 21.0.0+ is available, use pa.geography() with CRS and algorithm metadata
821+
return pa.large_binary()
822+
801823

802824
def _convert_scalar(value: Any, iceberg_type: IcebergType) -> pa.scalar:
803825
if not isinstance(iceberg_type, PrimitiveType):
@@ -2111,6 +2133,12 @@ def visit_binary(self, binary_type: BinaryType) -> str:
21112133
def visit_unknown(self, unknown_type: UnknownType) -> str:
21122134
return "UNKNOWN"
21132135

2136+
def visit_geometry(self, geometry_type: GeometryType) -> str:
2137+
return "BYTE_ARRAY"
2138+
2139+
def visit_geography(self, geography_type: GeographyType) -> str:
2140+
return "BYTE_ARRAY"
2141+
21142142

21152143
_PRIMITIVE_TO_PHYSICAL_TYPE_VISITOR = PrimitiveToPhysicalType()
21162144

pyiceberg/schema.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,8 @@
4343
DoubleType,
4444
FixedType,
4545
FloatType,
46+
GeographyType,
47+
GeometryType,
4648
IcebergType,
4749
IntegerType,
4850
ListType,
@@ -553,6 +555,10 @@ def primitive(self, primitive: PrimitiveType, primitive_partner: P | None) -> T:
553555
return self.visit_binary(primitive, primitive_partner)
554556
elif isinstance(primitive, UnknownType):
555557
return self.visit_unknown(primitive, primitive_partner)
558+
elif isinstance(primitive, GeometryType):
559+
return self.visit_geometry(primitive, primitive_partner)
560+
elif isinstance(primitive, GeographyType):
561+
return self.visit_geography(primitive, primitive_partner)
556562
else:
557563
raise ValueError(f"Type not recognized: {primitive}")
558564

@@ -624,6 +630,14 @@ def visit_binary(self, binary_type: BinaryType, partner: P | None) -> T:
624630
def visit_unknown(self, unknown_type: UnknownType, partner: P | None) -> T:
625631
"""Visit a UnknownType."""
626632

633+
@abstractmethod
634+
def visit_geometry(self, geometry_type: GeometryType, partner: P | None) -> T:
635+
"""Visit a GeometryType."""
636+
637+
@abstractmethod
638+
def visit_geography(self, geography_type: GeographyType, partner: P | None) -> T:
639+
"""Visit a GeographyType."""
640+
627641

628642
class PartnerAccessor(Generic[P], ABC):
629643
@abstractmethod
@@ -747,6 +761,10 @@ def primitive(self, primitive: PrimitiveType) -> T:
747761
return self.visit_binary(primitive)
748762
elif isinstance(primitive, UnknownType):
749763
return self.visit_unknown(primitive)
764+
elif isinstance(primitive, GeometryType):
765+
return self.visit_geometry(primitive)
766+
elif isinstance(primitive, GeographyType):
767+
return self.visit_geography(primitive)
750768
else:
751769
raise ValueError(f"Type not recognized: {primitive}")
752770

@@ -818,6 +836,14 @@ def visit_binary(self, binary_type: BinaryType) -> T:
818836
def visit_unknown(self, unknown_type: UnknownType) -> T:
819837
"""Visit a UnknownType."""
820838

839+
@abstractmethod
840+
def visit_geometry(self, geometry_type: GeometryType) -> T:
841+
"""Visit a GeometryType."""
842+
843+
@abstractmethod
844+
def visit_geography(self, geography_type: GeographyType) -> T:
845+
"""Visit a GeographyType."""
846+
821847

822848
@dataclass(init=True, eq=True, frozen=True)
823849
class Accessor:

0 commit comments

Comments
 (0)