diff --git a/changes/3781.feature.md b/changes/3781.feature.md new file mode 100644 index 0000000000..191ca5ed4a --- /dev/null +++ b/changes/3781.feature.md @@ -0,0 +1 @@ +Added `Struct` class (subclass of `Structured`) implementing the zarr-extensions `struct` dtype spec. Uses object-style field format and dict fill values. Legacy `Structured` remains available for backward compatibility. diff --git a/docs/user-guide/data_types.md b/docs/user-guide/data_types.md index aa19baf891..3cdafb5f28 100644 --- a/docs/user-guide/data_types.md +++ b/docs/user-guide/data_types.md @@ -229,6 +229,37 @@ here, it's possible to create it yourself: see [Adding New Data Types](#adding-n #### Struct-like - [Structured][zarr.dtype.Structured] +!!! note "Zarr V3 Structured Data Types" + + In Zarr V3, structured data types are specified using the `struct` extension defined in the + [zarr-extensions repository](https://github.com/zarr-developers/zarr-extensions/tree/main/data-types/struct). + The JSON representation uses an object format for fields: + + ```json + { + "name": "struct", + "configuration": { + "fields": [ + {"name": "x", "data_type": "float32"}, + {"name": "y", "data_type": "int64"} + ] + } + } + ``` + + For backward compatibility, Zarr Python also accepts the legacy `structured` name with + tuple-format fields when reading existing data. + + Fill values for structured types are represented as JSON objects mapping field names to values: + + ```json + {"x": 1.5, "y": 42} + ``` + + When using structured types with multi-byte fields, the `bytes` codec must specify an + explicit `endian` parameter. If omitted, Zarr Python assumes little-endian for legacy + compatibility but emits a warning. + ### Example Usage This section demonstrates the basic usage of Zarr data types. 
diff --git a/src/zarr/codecs/bytes.py b/src/zarr/codecs/bytes.py index 86bb354fb5..e40bee3b16 100644 --- a/src/zarr/codecs/bytes.py +++ b/src/zarr/codecs/bytes.py @@ -1,6 +1,7 @@ from __future__ import annotations import sys +import warnings from dataclasses import dataclass, replace from enum import Enum from typing import TYPE_CHECKING @@ -9,6 +10,7 @@ from zarr.core.buffer import Buffer, NDBuffer from zarr.core.common import JSON, parse_enum, parse_named_configuration from zarr.core.dtype.common import HasEndianness +from zarr.core.dtype.npy.structured import Structured if TYPE_CHECKING: from typing import Self @@ -56,7 +58,20 @@ def to_dict(self) -> dict[str, JSON]: return {"name": "bytes", "configuration": {"endian": self.endian.value}} def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: - if not isinstance(array_spec.dtype, HasEndianness): + if isinstance(array_spec.dtype, Structured): + if array_spec.dtype.has_multi_byte_fields(): + if self.endian is None: + warnings.warn( + "Missing 'endian' for structured dtype with multi-byte fields. " + "Assuming little-endian for legacy compatibility.", + UserWarning, + stacklevel=2, + ) + return replace(self, endian=Endian.little) + else: + if self.endian is not None: + return replace(self, endian=None) + elif not isinstance(array_spec.dtype, HasEndianness): if self.endian is not None: return replace(self, endian=None) elif self.endian is None: diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 486216fa32..b210aab182 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -66,6 +66,7 @@ ) from zarr.core.config import config as zarr_config from zarr.core.dtype import ( + Structured, VariableLengthBytes, VariableLengthUTF8, ZDType, @@ -5067,10 +5068,13 @@ def default_serializer_v3(dtype: ZDType[Any, Any]) -> ArrayBytesCodec: length strings and variable length bytes have hard-coded serializers -- ``VLenUTF8Codec`` and ``VLenBytesCodec``, respectively. 
+ Structured data types with multi-byte fields use ``BytesCodec`` with little-endian encoding. """ serializer: ArrayBytesCodec = BytesCodec(endian=None) - if isinstance(dtype, HasEndianness): + if isinstance(dtype, HasEndianness) or ( + isinstance(dtype, Structured) and dtype.has_multi_byte_fields() + ): serializer = BytesCodec(endian="little") elif isinstance(dtype, HasObjectCodec): if dtype.object_codec_id == "vlen-bytes": diff --git a/src/zarr/core/dtype/__init__.py b/src/zarr/core/dtype/__init__.py index 1049a2063f..290a51d287 100644 --- a/src/zarr/core/dtype/__init__.py +++ b/src/zarr/core/dtype/__init__.py @@ -21,7 +21,13 @@ from zarr.core.dtype.npy.complex import Complex64, Complex128 from zarr.core.dtype.npy.float import Float16, Float32, Float64 from zarr.core.dtype.npy.int import Int8, Int16, Int32, Int64, UInt8, UInt16, UInt32, UInt64 -from zarr.core.dtype.npy.structured import Structured, StructuredJSON_V2, StructuredJSON_V3 +from zarr.core.dtype.npy.structured import ( + Struct, + StructJSON_V3, + Structured, + StructuredJSON_V2, + StructuredJSON_V3, +) from zarr.core.dtype.npy.time import ( DateTime64, DateTime64JSON_V2, @@ -75,6 +81,8 @@ "RawBytes", "RawBytesJSON_V2", "RawBytesJSON_V3", + "Struct", + "StructJSON_V3", "Structured", "StructuredJSON_V2", "StructuredJSON_V3", @@ -124,7 +132,7 @@ | ComplexFloatDType | StringDType | BytesDType - | Structured + | Struct | TimeDType | VariableLengthBytes ) @@ -137,7 +145,7 @@ *COMPLEX_FLOAT_DTYPE, *STRING_DTYPE, *BYTES_DTYPE, - Structured, + Struct, *TIME_DTYPE, VariableLengthBytes, ) @@ -155,6 +163,10 @@ # mypy does not know that all the elements of ANY_DTYPE are subclasses of ZDType data_type_registry.register(dtype._zarr_v3_name, dtype) # type: ignore[arg-type] +# Register Structured for reading legacy "structured" format JSON, but don't include it in +# ANY_DTYPE since it doesn't support native dtype matching (use Struct instead). 
+data_type_registry.register(Structured._zarr_v3_name, Structured) + # TODO: find a better name for this function def get_data_type_from_native_dtype(dtype: npt.DTypeLike) -> ZDType[TBaseDType, TBaseScalar]: diff --git a/src/zarr/core/dtype/npy/structured.py b/src/zarr/core/dtype/npy/structured.py index 8bedee07ef..2cb1be3295 100644 --- a/src/zarr/core/dtype/npy/structured.py +++ b/src/zarr/core/dtype/npy/structured.py @@ -61,11 +61,10 @@ class StructuredJSON_V3( NamedConfig[Literal["structured"], dict[str, Sequence[Sequence[str | DTypeJSON]]]] ): """ - A JSON representation of a structured data type in Zarr V3. + A JSON representation of a structured data type in Zarr V3 (legacy format). - References - ---------- - This representation is not currently defined in an external specification. + This is the legacy format using tuple-style field definitions. + For the canonical format, see ``StructJSON_V3``. Examples -------- @@ -83,14 +82,44 @@ class StructuredJSON_V3( """ +class StructJSON_V3( + NamedConfig[Literal["struct"], dict[str, Sequence[dict[str, str | DTypeJSON]]]] +): + """ + A JSON representation of a structured data type in Zarr V3 (canonical format). + + References + ---------- + The Zarr V3 specification for this data type is defined in the zarr-extensions repository: + https://github.com/zarr-developers/zarr-extensions/tree/main/data-types/struct + + Examples + -------- + ```python + { + "name": "struct", + "configuration": { + "fields": [ + {"name": "f0", "data_type": "int32"}, + {"name": "f1", "data_type": "float64"}, + ] + } + } + ``` + """ + + @dataclass(frozen=True, kw_only=True) class Structured(ZDType[np.dtypes.VoidDType[int], np.void], HasItemSize): """ - A Zarr data type for arrays containing structured scalars, AKA "record arrays". + A Zarr data type for arrays containing structured scalars, AKA "record arrays" (legacy format). Wraps the NumPy `np.dtypes.VoidDType` if the data type has fields. 
Scalars for this data type are instances of `np.void`, with a ``fields`` attribute. + This class handles the legacy "structured" format with tuple-style field definitions. + For the canonical "struct" format, see ``Struct``. + Attributes ---------- fields : Sequence[tuple[str, ZDType]] @@ -98,8 +127,6 @@ class Structured(ZDType[np.dtypes.VoidDType[int], np.void], HasItemSize): References ---------- - This data type does not have a Zarr V3 specification. - The Zarr V2 data type specification can be found [here](https://github.com/zarr-developers/zarr-specs/blob/main/docs/v2/v2.0.rst#data-type-encoding). """ @@ -113,61 +140,11 @@ def __post_init__(self) -> None: @classmethod def _check_native_dtype(cls, dtype: TBaseDType) -> TypeGuard[np.dtypes.VoidDType[int]]: - """ - Check that this dtype is a numpy structured dtype - - Parameters - ---------- - dtype : np.dtypes.DTypeLike - The dtype to check. - - Returns - ------- - TypeGuard[np.dtypes.VoidDType] - True if the dtype matches, False otherwise. - """ - return isinstance(dtype, cls.dtype_cls) and dtype.fields is not None + return False @classmethod def from_native_dtype(cls, dtype: TBaseDType) -> Self: - """ - Create a Structured ZDType from a native NumPy data type. - - Parameters - ---------- - dtype : TBaseDType - The native data type. - - Returns - ------- - Self - An instance of this data type. - - Raises - ------ - DataTypeValidationError - If the input data type is not an instance of np.dtypes.VoidDType with a non-null - ``fields`` attribute. - - Notes - ----- - This method attempts to resolve the fields of the structured dtype using the data type - registry. - """ - from zarr.core.dtype import get_data_type_from_native_dtype - - fields: list[tuple[str, ZDType[TBaseDType, TBaseScalar]]] = [] - if cls._check_native_dtype(dtype): - # fields of a structured numpy dtype are either 2-tuples or 3-tuples. we only - # care about the first element in either case. 
- for key, (dtype_instance, *_) in dtype.fields.items(): # type: ignore[union-attr] - dtype_wrapped = get_data_type_from_native_dtype(dtype_instance) - fields.append((key, dtype_wrapped)) - - return cls(fields=tuple(fields)) - raise DataTypeValidationError( - f"Invalid data type: {dtype}. Expected an instance of {cls.dtype_cls}" - ) + raise DataTypeValidationError(f"Use 'Struct' for native dtype matching. Got: {dtype}") def to_native_dtype(self) -> np.dtypes.VoidDType[int]: """ @@ -234,7 +211,6 @@ def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[StructuredJSON_V3]: True if the input is a valid JSON representation of a structured data type for Zarr V3, False otherwise. """ - return ( isinstance(data, dict) and set(data.keys()) == {"name", "configuration"} @@ -245,13 +221,9 @@ def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[StructuredJSON_V3]: @classmethod def _from_json_v2(cls, data: DTypeJSON) -> Self: - # avoid circular import from zarr.core.dtype import get_data_type_from_json if cls._check_json_v2(data): - # structured dtypes are constructed directly from a list of lists - # note that we do not handle the object codec here! this will prevent structured - # dtypes from containing object dtypes. return cls( fields=tuple( # type: ignore[misc] ( # type: ignore[misc] @@ -268,18 +240,19 @@ def _from_json_v2(cls, data: DTypeJSON) -> Self: @classmethod def _from_json_v3(cls, data: DTypeJSON) -> Self: - # avoid circular import from zarr.core.dtype import get_data_type_from_json if cls._check_json_v3(data): config = data["configuration"] meta_fields = config["fields"] - return cls( - fields=tuple( - (f_name, get_data_type_from_json(f_dtype, zarr_format=3)) # type: ignore[misc] - for f_name, f_dtype in meta_fields - ) - ) + parsed_fields: list[tuple[str, ZDType[TBaseDType, TBaseScalar]]] = [] + for field in meta_fields: + if isinstance(field, dict): + msg = f"Invalid field format for 'structured' dtype. 
Expected [name, dtype] tuple, got {field!r}" + raise DataTypeValidationError(msg) + f_name, f_dtype = field + parsed_fields.append((f_name, get_data_type_from_json(f_dtype, zarr_format=3))) # type: ignore[arg-type] + return cls(fields=tuple(parsed_fields)) msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected a JSON object with the key {cls._zarr_v3_name!r}" raise DataTypeValidationError(msg) @@ -328,7 +301,6 @@ def to_json(self, zarr_format: ZarrFormat) -> StructuredJSON_V2 | StructuredJSON raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover def _check_scalar(self, data: object) -> TypeGuard[StructuredScalarLike]: - # TODO: implement something more precise here! """ Check that the input is a valid scalar value for this structured data type. @@ -425,7 +397,9 @@ def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: Parameters ---------- data : JSON - The JSON-serializable value. + The JSON-serializable value. Can be either: + - A dict mapping field names to values (primary format for V3) + - A base64-encoded string (legacy format, for backward compatibility) zarr_format : ZarrFormat The zarr format version. @@ -437,17 +411,27 @@ def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: Raises ------ TypeError - If the input is not a base64-encoded string. + If the input is not a dict or base64-encoded string. 
""" - if check_json_str(data): + if isinstance(data, dict): + field_values = [] + for field_name, field_dtype in self.fields: + if field_name in data: + field_values.append( + field_dtype.from_json_scalar(data[field_name], zarr_format=zarr_format) + ) + else: + field_values.append(field_dtype.default_scalar()) + return self._cast_scalar_unchecked(tuple(field_values)) + elif check_json_str(data): as_bytes = bytes_from_json(data, zarr_format=zarr_format) dtype = self.to_native_dtype() return cast("np.void", np.array([as_bytes]).view(dtype)[0]) - raise TypeError(f"Invalid type: {data}. Expected a string.") + raise TypeError(f"Invalid type: {data}. Expected a dict or base64-encoded string.") - def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> str: + def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> str | dict[str, JSON]: """ - Convert a scalar to a JSON-serializable string representation. + Convert a scalar to a JSON-serializable representation. Parameters ---------- @@ -458,11 +442,19 @@ def to_json_scalar(self, data: object, *, zarr_format: ZarrFormat) -> str: Returns ------- - str - A string representation of the scalar, which is a base64-encoded - string of the bytes that make up the scalar. + str | dict[str, JSON] + For V2: A base64-encoded string of the bytes that make up the scalar. + For V3: A dict mapping field names to their JSON-serialized values. """ - return bytes_to_json(self.cast_scalar(data).tobytes(), zarr_format) + scalar = self.cast_scalar(data) + if zarr_format == 2: + return bytes_to_json(scalar.tobytes(), zarr_format) + result: dict[str, JSON] = {} + for field_name, field_dtype in self.fields: + result[field_name] = field_dtype.to_json_scalar( + scalar[field_name], zarr_format=zarr_format + ) + return result @property def item_size(self) -> int: @@ -475,3 +467,154 @@ def item_size(self) -> int: The size of a single scalar in bytes. 
""" return self.to_native_dtype().itemsize + + def has_multi_byte_fields(self) -> bool: + """ + Check if this structured dtype has any fields with item_size > 1. + + Returns + ------- + bool + True if any field has item_size > 1, False otherwise. + """ + return any( + isinstance(field_dtype, HasItemSize) and field_dtype.item_size > 1 + for _, field_dtype in self.fields + ) + + +@dataclass(frozen=True, kw_only=True) +class Struct(Structured): + """ + A Zarr data type for arrays containing structured scalars, AKA "record arrays". + + Wraps the NumPy `np.dtypes.VoidDType` if the data type has fields. Scalars for this data + type are instances of `np.void`, with a ``fields`` attribute. + + This class handles the canonical "struct" format with object-style field definitions. + For the legacy "structured" format, see ``Structured``. + + Attributes + ---------- + fields : Sequence[tuple[str, ZDType]] + The fields of the structured dtype. + + References + ---------- + The Zarr V3 specification for this data type is defined in the zarr-extensions repository: + https://github.com/zarr-developers/zarr-extensions/tree/main/data-types/struct + + The Zarr V2 data type specification can be found [here](https://github.com/zarr-developers/zarr-specs/blob/main/docs/v2/v2.0.rst#data-type-encoding). + """ + + _zarr_v3_name: ClassVar[Literal["struct"]] = "struct" # type: ignore[assignment] + + @classmethod + def _check_native_dtype(cls, dtype: TBaseDType) -> TypeGuard[np.dtypes.VoidDType[int]]: + """ + Check that this dtype is a numpy structured dtype. + + Parameters + ---------- + dtype : np.dtypes.DTypeLike + The dtype to check. + + Returns + ------- + TypeGuard[np.dtypes.VoidDType] + True if the dtype matches, False otherwise. + """ + return isinstance(dtype, cls.dtype_cls) and dtype.fields is not None + + @classmethod + def from_native_dtype(cls, dtype: TBaseDType) -> Self: + """ + Create a Struct ZDType from a native NumPy data type. 
+ + Parameters + ---------- + dtype : TBaseDType + The native data type. + + Returns + ------- + Self + An instance of this data type. + + Raises + ------ + DataTypeValidationError + If the input data type is not an instance of np.dtypes.VoidDType with a non-null + ``fields`` attribute. + + Notes + ----- + This method attempts to resolve the fields of the structured dtype using the data type + registry. + """ + from zarr.core.dtype import get_data_type_from_native_dtype + + fields: list[tuple[str, ZDType[TBaseDType, TBaseScalar]]] = [] + if cls._check_native_dtype(dtype): + for key, (dtype_instance, *_) in dtype.fields.items(): # type: ignore[union-attr] + dtype_wrapped = get_data_type_from_native_dtype(dtype_instance) + fields.append((key, dtype_wrapped)) + + return cls(fields=tuple(fields)) + raise DataTypeValidationError( + f"Invalid data type: {dtype}. Expected an instance of {cls.dtype_cls}" + ) + + @classmethod + def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[StructJSON_V3]: # type: ignore[override] + return ( + isinstance(data, dict) + and set(data.keys()) == {"name", "configuration"} + and data["name"] == cls._zarr_v3_name + and isinstance(data["configuration"], dict) + and set(data["configuration"].keys()) == {"fields"} + ) + + @classmethod + def _from_json_v3(cls, data: DTypeJSON) -> Self: + from zarr.core.dtype import get_data_type_from_json + + if cls._check_json_v3(data): + config = data["configuration"] + meta_fields = config["fields"] + parsed_fields: list[tuple[str, ZDType[TBaseDType, TBaseScalar]]] = [] + for field in meta_fields: + if not isinstance(field, dict): + msg = f"Invalid field format for 'struct' dtype. 
Expected object with 'name' and 'data_type' keys, got {field!r}" # type: ignore[unreachable] + raise DataTypeValidationError(msg) + f_name = field["name"] + f_dtype = field["data_type"] + parsed_fields.append((f_name, get_data_type_from_json(f_dtype, zarr_format=3))) # type: ignore[arg-type] + return cls(fields=tuple(parsed_fields)) + msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected a JSON object with the key {cls._zarr_v3_name!r}" + raise DataTypeValidationError(msg) + + @overload # type: ignore[override] + def to_json(self, zarr_format: Literal[2]) -> StructuredJSON_V2: ... + + @overload + def to_json(self, zarr_format: Literal[3]) -> StructJSON_V3: ... + + def to_json(self, zarr_format: ZarrFormat) -> StructuredJSON_V2 | StructJSON_V3: + if zarr_format == 2: + fields_v2 = [ + [f_name, f_dtype.to_json(zarr_format=zarr_format)["name"]] + for f_name, f_dtype in self.fields + ] + return {"name": fields_v2, "object_codec_id": None} + elif zarr_format == 3: + v3_unstable_dtype_warning(self) + fields_v3 = [ + {"name": f_name, "data_type": f_dtype.to_json(zarr_format=zarr_format)} + for f_name, f_dtype in self.fields + ] + return cast( + "StructJSON_V3", + {"name": self._zarr_v3_name, "configuration": {"fields": fields_v3}}, + ) + raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover diff --git a/src/zarr/dtype.py b/src/zarr/dtype.py index 2c7eb651b0..f75219aab8 100644 --- a/src/zarr/dtype.py +++ b/src/zarr/dtype.py @@ -22,6 +22,8 @@ RawBytes, RawBytesJSON_V2, RawBytesJSON_V3, + Struct, + StructJSON_V3, Structured, StructuredJSON_V2, StructuredJSON_V3, @@ -68,6 +70,8 @@ "RawBytes", "RawBytesJSON_V2", "RawBytesJSON_V3", + "Struct", + "StructJSON_V3", "Structured", "StructuredJSON_V2", "StructuredJSON_V3", diff --git a/tests/test_dtype/conftest.py b/tests/test_dtype/conftest.py index 0650d143c6..7bc309a371 100644 --- a/tests/test_dtype/conftest.py +++ b/tests/test_dtype/conftest.py @@ -6,19 +6,22 @@ from 
zarr.core.dtype import data_type_registry from zarr.core.dtype.common import HasLength -from zarr.core.dtype.npy.structured import Structured +from zarr.core.dtype.npy.structured import Struct, Structured from zarr.core.dtype.npy.time import DateTime64, TimeDelta64 from zarr.core.dtype.wrapper import ZDType zdtype_examples: tuple[ZDType[Any, Any], ...] = () for wrapper_cls in data_type_registry.contents.values(): - # The Structured dtype has to be constructed with some actual fields - if wrapper_cls is Structured: + # The Struct dtype has to be constructed with some actual fields + if wrapper_cls is Struct: with warnings.catch_warnings(): warnings.simplefilter("ignore") zdtype_examples += ( wrapper_cls.from_native_dtype(np.dtype([("a", np.float64), ("b", np.int8)])), ) + # The legacy Structured dtype doesn't support native dtype matching, skip it + elif wrapper_cls is Structured: + continue elif issubclass(wrapper_cls, HasLength): zdtype_examples += (wrapper_cls(length=1),) elif issubclass(wrapper_cls, DateTime64 | TimeDelta64): diff --git a/tests/test_dtype/test_npy/test_structured.py b/tests/test_dtype/test_npy/test_structured.py index e2cd2a6dfe..63ac98d1a4 100644 --- a/tests/test_dtype/test_npy/test_structured.py +++ b/tests/test_dtype/test_npy/test_structured.py @@ -11,12 +11,17 @@ Float64, Int32, Int64, + Struct, Structured, + UInt8, ) +from zarr.core.dtype.common import DataTypeValidationError -class TestStructured(BaseTestZDType): - test_cls = Structured +class TestStruct(BaseTestZDType): + """Test the canonical 'struct' dtype format.""" + + test_cls = Struct valid_dtype = ( np.dtype([("field1", np.int32), ("field2", np.float64)]), np.dtype([("field1", np.int64), ("field2", np.int32)]), @@ -32,29 +37,32 @@ class TestStructured(BaseTestZDType): ) valid_json_v3 = ( { - "name": "structured", + "name": "struct", "configuration": { "fields": [ - ["field1", "int32"], - ["field2", "float64"], + {"name": "field1", "data_type": "int32"}, + {"name": "field2", 
"data_type": "float64"}, ] }, }, { - "name": "structured", + "name": "struct", "configuration": { "fields": [ - [ - "field1", - { + { + "name": "field1", + "data_type": { "name": "numpy.datetime64", "configuration": {"unit": "s", "scale_factor": 1}, }, - ], - [ - "field2", - {"name": "fixed_length_utf32", "configuration": {"length_bytes": 32}}, - ], + }, + { + "name": "field2", + "data_type": { + "name": "fixed_length_utf32", + "configuration": {"length_bytes": 32}, + }, + }, ] }, }, @@ -65,7 +73,7 @@ class TestStructured(BaseTestZDType): ) invalid_json_v3 = ( { - "name": "structured", + "name": "struct", "configuration": { "fields": [ ("field1", {"name": "int32", "configuration": {"endianness": "invalid"}}), @@ -77,35 +85,38 @@ class TestStructured(BaseTestZDType): ) scalar_v2_params = ( - (Structured(fields=(("field1", Int32()), ("field2", Float64()))), "AQAAAAAAAAAAAPA/"), - (Structured(fields=(("field1", Float16()), ("field2", Int32()))), "AQAAAAAA"), + (Struct(fields=(("field1", Int32()), ("field2", Float64()))), "AQAAAAAAAAAAAPA/"), + (Struct(fields=(("field1", Float16()), ("field2", Int32()))), "AQAAAAAA"), ) scalar_v3_params = ( - (Structured(fields=(("field1", Int32()), ("field2", Float64()))), "AQAAAAAAAAAAAPA/"), - (Structured(fields=(("field1", Int64()), ("field2", Int32()))), "AQAAAAAAAAAAAPA/"), + ( + Struct(fields=(("field1", Int32()), ("field2", Float64()))), + {"field1": 1, "field2": 1.0}, + ), + (Struct(fields=(("field1", Int64()), ("field2", Int32()))), {"field1": 1, "field2": 1}), ) cast_value_params = ( ( - Structured(fields=(("field1", Int32()), ("field2", Float64()))), + Struct(fields=(("field1", Int32()), ("field2", Float64()))), (1, 2.0), np.array((1, 2.0), dtype=[("field1", np.int32), ("field2", np.float64)]), ), ( - Structured(fields=(("field1", Int64()), ("field2", Int32()))), + Struct(fields=(("field1", Int64()), ("field2", Int32()))), (3, 4.5), np.array((3, 4.5), dtype=[("field1", np.int64), ("field2", np.int32)]), ), ) 
item_size_params = ( - Structured(fields=(("field1", Int32()), ("field2", Float64()))), - Structured(fields=(("field1", Int64()), ("field2", Int32()))), + Struct(fields=(("field1", Int32()), ("field2", Float64()))), + Struct(fields=(("field1", Int64()), ("field2", Int32()))), ) invalid_scalar_params = ( - (Structured(fields=(("field1", Int32()), ("field2", Float64()))), "i am a string"), - (Structured(fields=(("field1", Int32()), ("field2", Float64()))), {"type": "dict"}), + (Struct(fields=(("field1", Int32()), ("field2", Float64()))), "i am a string"), + (Struct(fields=(("field1", Int32()), ("field2", Float64()))), {"type": "dict"}), ) def scalar_equals(self, scalar1: Any, scalar2: Any) -> bool: @@ -114,11 +125,139 @@ def scalar_equals(self, scalar1: Any, scalar2: Any) -> bool: return super().scalar_equals(scalar1, scalar2) +class TestStructured: + """Test the legacy 'structured' dtype format.""" + + def test_invalid_size(self) -> None: + """Test that it's impossible to create a data type that has no fields.""" + fields = () + msg = f"must have at least one field. 
Got {fields!r}" + with pytest.raises(ValueError, match=msg): + Structured(fields=fields) + + def test_structured_legacy_name_with_tuple_format(self) -> None: + """Test that the legacy 'structured' name with tuple field format is accepted.""" + json_v3 = { + "name": "structured", + "configuration": { + "fields": [ + ["field1", "int32"], + ["field2", "float64"], + ] + }, + } + dtype = Structured.from_json(json_v3, zarr_format=3) + assert dtype.fields[0][0] == "field1" + assert dtype.fields[1][0] == "field2" + + def test_structured_rejects_object_format(self) -> None: + """Test that 'structured' dtype rejects the new object field format.""" + json_v3 = { + "name": "structured", + "configuration": { + "fields": [ + {"name": "field1", "data_type": "int32"}, + {"name": "field2", "data_type": "float64"}, + ] + }, + } + with pytest.raises(DataTypeValidationError, match="Invalid field format for 'structured'"): + Structured.from_json(json_v3, zarr_format=3) + + @pytest.mark.filterwarnings("ignore::zarr.errors.UnstableSpecificationWarning") + def test_structured_writes_tuple_format(self) -> None: + """Test that 'structured' writes the tuple field format.""" + dtype = Structured(fields=(("field1", Int32()), ("field2", Float64()))) + json_v3 = dtype.to_json(zarr_format=3) + assert json_v3["name"] == "structured" + assert json_v3["configuration"]["fields"][0] == ["field1", "int32"] + + def test_structured_no_native_dtype_matching(self) -> None: + dtype = np.dtype([("field1", np.int32), ("field2", np.float64)]) + with pytest.raises(DataTypeValidationError, match="Use 'Struct' for native dtype matching"): + Structured.from_native_dtype(dtype) + + def test_invalid_size() -> None: - """ - Test that it's impossible to create a data type that has no fields - """ + """Test that it's impossible to create a data type that has no fields.""" fields = () msg = f"must have at least one field. 
Got {fields!r}" with pytest.raises(ValueError, match=msg): - Structured(fields=fields) + Struct(fields=fields) + + +@pytest.mark.filterwarnings("ignore::zarr.errors.UnstableSpecificationWarning") +def test_struct_name_is_primary() -> None: + """Test that 'struct' is the primary name written to JSON.""" + dtype = Struct(fields=(("field1", Int32()), ("field2", Float64()))) + json_v3 = dtype.to_json(zarr_format=3) + assert json_v3["name"] == "struct" + + +def test_struct_rejects_tuple_format() -> None: + """Test that 'struct' dtype rejects the legacy tuple field format.""" + json_v3 = { + "name": "struct", + "configuration": { + "fields": [ + ["field1", "int32"], + ["field2", "float64"], + ] + }, + } + with pytest.raises(DataTypeValidationError, match="Invalid field format for 'struct'"): + Struct.from_json(json_v3, zarr_format=3) + + +def test_fill_value_dict_form() -> None: + """Test that dict form fill values are properly parsed.""" + dtype = Struct(fields=(("x", Int32()), ("y", Float64()))) + fill_value = dtype.from_json_scalar({"x": 42, "y": 3.14}, zarr_format=3) + assert fill_value["x"] == 42 + assert fill_value["y"] == 3.14 + + +def test_fill_value_dict_form_missing_fields() -> None: + """Test that missing fields in dict form fill values use defaults.""" + dtype = Struct(fields=(("x", Int32()), ("y", Float64()))) + fill_value = dtype.from_json_scalar({"x": 42}, zarr_format=3) + assert fill_value["x"] == 42 + assert fill_value["y"] == 0.0 + + +def test_fill_value_legacy_base64() -> None: + """Test that legacy base64-encoded fill values are still readable.""" + dtype = Struct(fields=(("field1", Int32()), ("field2", Float64()))) + fill_value = dtype.from_json_scalar("AQAAAAAAAAAAAPA/", zarr_format=3) + assert fill_value["field1"] == 1 + assert fill_value["field2"] == 1.0 + + +def test_fill_value_to_json_dict_form() -> None: + """Test that fill values are serialized as dict form.""" + dtype = Struct(fields=(("x", Int32()), ("y", Float64()))) + scalar = 
np.array((42, 3.14), dtype=[("x", np.int32), ("y", np.float64)])[()] + json_val = dtype.to_json_scalar(scalar, zarr_format=3) + assert isinstance(json_val, dict) + assert json_val["x"] == 42 + assert json_val["y"] == 3.14 + + +def test_has_multi_byte_fields_true() -> None: + """Test that has_multi_byte_fields returns True for dtypes with multi-byte fields.""" + dtype = Struct(fields=(("field1", Int32()), ("field2", Float64()))) + assert dtype.has_multi_byte_fields() is True + + +def test_has_multi_byte_fields_false() -> None: + """Test that has_multi_byte_fields returns False for dtypes with only single-byte fields.""" + dtype = Struct(fields=(("field1", UInt8()), ("field2", UInt8()))) + assert dtype.has_multi_byte_fields() is False + + +def test_struct_from_native_dtype() -> None: + """Test that Struct can be created from native numpy dtype.""" + dtype = np.dtype([("field1", np.int32), ("field2", np.float64)]) + struct = Struct.from_native_dtype(dtype) + assert struct.fields[0][0] == "field1" + assert struct.fields[1][0] == "field2" diff --git a/tests/test_v2.py b/tests/test_v2.py index cb990f6159..3a063ac509 100644 --- a/tests/test_v2.py +++ b/tests/test_v2.py @@ -14,8 +14,9 @@ from zarr import config from zarr.abc.store import Store from zarr.core.buffer.core import default_buffer_prototype -from zarr.core.dtype import FixedLengthUTF32, Structured, VariableLengthUTF8 +from zarr.core.dtype import FixedLengthUTF32, VariableLengthUTF8 from zarr.core.dtype.npy.bytes import NullTerminatedBytes +from zarr.core.dtype.npy.structured import Struct from zarr.core.dtype.wrapper import ZDType from zarr.core.group import Group from zarr.core.sync import sync @@ -283,7 +284,7 @@ def test_structured_dtype_roundtrip(fill_value: float | bytes, tmp_path: Path) - def test_parse_structured_fill_value_valid( fill_value: Any, dtype: np.dtype[Any], expected_result: Any ) -> None: - zdtype = Structured.from_native_dtype(dtype) + zdtype = Struct.from_native_dtype(dtype) result = 
zdtype.cast_scalar(fill_value) assert result.dtype == expected_result.dtype assert result == expected_result