diff --git a/docs/devnotes/feat-v3-scale-offset-cast.md b/docs/devnotes/feat-v3-scale-offset-cast.md new file mode 100644 index 0000000000..3236a56318 --- /dev/null +++ b/docs/devnotes/feat-v3-scale-offset-cast.md @@ -0,0 +1,179 @@ +# scale_offset and cast_value codecs + +Source: https://github.com/zarr-developers/zarr-extensions/pull/43 + +## Overview + +Two array-to-array codecs for zarr v3, designed to work together for the +common pattern of storing floating-point data as compressed integers. + +--- + +## scale_offset + +**Type:** array -> array (does NOT change dtype) + +**Encode:** `out = (in - offset) * scale` +**Decode:** `out = (in / scale) + offset` + +### Parameters +- `offset` (optional, float): scalar subtracted during encoding. Default: 0. +- `scale` (optional, float): scalar multiplied during encoding (after offset subtraction). Default: 1. + +### Key rules +- Arithmetic uses the input array's own data type semantics (no implicit promotion). +- If neither scale nor offset is given, `configuration` may be omitted (codec is a no-op). +- Fill value is transformed through the codec (encode direction). +- Only valid for real-number data types (int/uint/float families). Complex dtypes are rejected at validation time. + +### JSON +```json +{"name": "scale_offset", "configuration": {"offset": 5, "scale": 0.1}} +``` +When both offset and scale are defaults: `{"name": "scale_offset"}` (no configuration key). + +--- + +## cast_value + +**Type:** array -> array (CHANGES dtype) + +**Purpose:** Value-convert (not binary-reinterpret) array elements to a new data type. + +### Parameters +- `data_type` (required): target zarr v3 data type name (e.g. `"uint8"`, `"float32"`). + Internally stored as a `ZDType` instance, resolved via `get_data_type_from_json`. +- `rounding` (optional): how to round when casting float to int. + Values: `"nearest-even"` (default), `"towards-zero"`, `"towards-positive"`, + `"towards-negative"`, `"nearest-away"`. 
+- `out_of_range` (optional): what to do when a value is outside the target's range. + Values: `"clamp"`, `"wrap"`. If absent, out-of-range values raise an error. + `"wrap"` is only valid for integer target types. +- `scalar_map` (optional): explicit value overrides. + `{"encode": [[input, output], ...], "decode": [[input, output], ...]}`. + Applied BEFORE rounding/out_of_range. Each entry's source is deserialized using the + source dtype and target using the target dtype (via `ZDType.from_json_scalar`), + preserving full precision for both sides. + +### Cast procedure (`_cast_array_impl`) + +Dispatches on `(src_type, tgt_type, has_map)` where src/tgt are `"int"` or `"float"`: + +| Source | Target | scalar_map | Procedure | +|--------|--------|------------|-----------| +| any | float | no | `arr.astype(target_dtype)` | +| int | float | yes | widen to float64, apply map, cast | +| float | float | yes | copy, apply map, cast | +| int | int | no | range check, then astype | +| int | int | yes | widen to int64, apply map, range check | +| float | int | any | widen to float64, apply map (if any), reject NaN/Inf, round, range check | + +All casts are wrapped in `np.errstate(over='raise', invalid='raise')` to convert +numpy overflow/invalid warnings to hard errors. + +### Validation checks +- Only integer and floating-point dtypes are allowed (both source and target). +- `out_of_range='wrap'` is rejected for non-integer target types. +- Int-to-float casts are rejected if the float type's mantissa cannot exactly represent + the full integer range (e.g. int64 -> float64 is rejected because float64 has only + 52 mantissa bits, but int64 has values up to 2^63-1). Same check applies for the + float-to-int decode direction. + +### Special values +- NaN: detected dynamically via `isinstance(src, (float, np.floating)) and np.isnan(src)`. + NaN-to-integer casts error unless `scalar_map` provides a mapping. + Hex-encoded NaN strings (e.g. 
`"0x7fc00001"`) preserve NaN payloads per the zarr v3 spec. +- `_check_int_range` handles out-of-range integer values with clamp (via `np.clip`) or + wrap (via modular arithmetic). + +### Fill value +- Cast using the same `_cast_array` path as array elements, including scalar_map and rounding. +- Done in `resolve_metadata`, which also changes the chunk spec's dtype to the target. + +### JSON +```json +{ + "name": "cast_value", + "configuration": { + "data_type": "uint8", + "rounding": "nearest-even", + "out_of_range": "clamp", + "scalar_map": { + "encode": [["NaN", 0], ["+Infinity", 0], ["-Infinity", 0]], + "decode": [[0, "NaN"]] + } + } +} +``` +Only non-default fields are serialized (rounding and out_of_range are omitted when default). + +--- + +## Typical combined usage + +```json +{ + "data_type": "float64", + "fill_value": "NaN", + "codecs": [ + {"name": "scale_offset", "configuration": {"offset": -10, "scale": 0.1}}, + {"name": "cast_value", "configuration": { + "data_type": "uint8", + "rounding": "nearest-even", + "scalar_map": {"encode": [["NaN", 0]], "decode": [[0, "NaN"]]} + }}, + "bytes" + ] +} +``` + +--- + +## Implementation notes + +### Module structure +- `src/zarr/codecs/scale_offset.py` — `ScaleOffset` class +- `src/zarr/codecs/cast_value.py` — `CastValue` class and casting helpers +- `tests/test_codecs/test_scale_offset.py` — ScaleOffset tests +- `tests/test_codecs/test_cast_value.py` — CastValue tests + combined pipeline tests + +### scale_offset +- `@dataclass(kw_only=True, frozen=True)`, subclasses `ArrayArrayCodec`. +- Uses `ScaleOffsetJSON` (a `NamedConfig` TypedDict) for typed serialization. +- `from_dict` uses `parse_named_configuration(data, "scale_offset", require_configuration=False)`. +- `to_dict` omits the `configuration` key entirely when both offset=0 and scale=1. +- `resolve_metadata`: transforms fill_value via `(fill - offset) * scale`, dtype unchanged. +- `_encode_sync`: `(arr - offset) * scale` using the array's own dtype. 
+- `_decode_sync`: `(arr / scale) + offset` using the array's own dtype. +- `is_fixed_size = True`, `compute_encoded_size` returns input size unchanged. + +### cast_value +- `@dataclass(frozen=True)` with custom `__init__` (accepts `data_type: str | ZDType`). +- Stores `dtype: ZDType` (not a string). String data_type is resolved via `get_data_type_from_json`. +- `from_dict` uses `parse_named_configuration(data, "cast_value", require_configuration=True)`. +- `to_dict` serializes dtype via `self.dtype.to_json(zarr_format=3)`, only includes + non-default rounding/out_of_range/scalar_map. +- `resolve_metadata`: casts fill value, changes chunk spec dtype to target. +- `_encode_sync` / `_decode_sync`: delegate to `_cast_array`, threading the appropriate + scalar_map direction ("encode" or "decode") and the correct src/tgt ZDType pair for + scalar map deserialization. +- `compute_encoded_size`: scales by `target_itemsize / source_itemsize`. + +### Key helpers (cast_value.py) +- `_cast_array` — public entry point, wraps `_cast_array_impl` with `np.errstate`. +- `_cast_array_impl` — match-based dispatch on `(src_type, tgt_type, has_map)`. +- `_check_int_range` — integer range check with clamp/wrap/error. +- `_round_inplace` — rounding dispatch (rint, trunc, ceil, floor, nearest-away). +- `_apply_scalar_map` — in-place value remapping with NaN-aware matching. +- `_parse_map_entries` — deserializes scalar_map JSON using separate src/tgt ZDType instances. +- `_extract_raw_map` — extracts "encode" or "decode" direction from ScalarMapJSON. + +### Key design decisions +1. Encode = `(in - offset) * scale` (subtract, not add) — matches HDF5 and numcodecs. +2. No implicit precision promotion — arithmetic stays in the input dtype. +3. `out_of_range` defaults to error (not clamp). +4. `scalar_map` entries are typed: each side is deserialized with its own ZDType, + so int64 scalars don't lose precision through float64 intermediaries. +5. 
Fill value is cast through the same `_cast_array` path as data elements. +6. Int-to-float precision loss is caught at validate time (mantissa bit check). +7. Runtime overflow/invalid is caught via `np.errstate(over='raise', invalid='raise')`. diff --git a/src/zarr/codecs/__init__.py b/src/zarr/codecs/__init__.py index 4c621290e7..04b31d0d5f 100644 --- a/src/zarr/codecs/__init__.py +++ b/src/zarr/codecs/__init__.py @@ -2,6 +2,7 @@ from zarr.codecs.blosc import BloscCname, BloscCodec, BloscShuffle from zarr.codecs.bytes import BytesCodec, Endian +from zarr.codecs.cast_value import CastValue from zarr.codecs.crc32c_ import Crc32cCodec from zarr.codecs.gzip import GzipCodec from zarr.codecs.numcodecs import ( @@ -27,6 +28,7 @@ Zlib, Zstd, ) +from zarr.codecs.scale_offset import ScaleOffset from zarr.codecs.sharding import ShardingCodec, ShardingCodecIndexLocation from zarr.codecs.transpose import TransposeCodec from zarr.codecs.vlen_utf8 import VLenBytesCodec, VLenUTF8Codec @@ -38,9 +40,11 @@ "BloscCodec", "BloscShuffle", "BytesCodec", + "CastValue", "Crc32cCodec", "Endian", "GzipCodec", + "ScaleOffset", "ShardingCodec", "ShardingCodecIndexLocation", "TransposeCodec", @@ -61,6 +65,8 @@ register_codec("vlen-utf8", VLenUTF8Codec) register_codec("vlen-bytes", VLenBytesCodec) register_codec("transpose", TransposeCodec) +register_codec("scale_offset", ScaleOffset) +register_codec("cast_value", CastValue) # Register all the codecs formerly contained in numcodecs.zarr3 diff --git a/src/zarr/codecs/cast_value.py b/src/zarr/codecs/cast_value.py new file mode 100644 index 0000000000..ee1b2afbef --- /dev/null +++ b/src/zarr/codecs/cast_value.py @@ -0,0 +1,417 @@ +from __future__ import annotations + +from dataclasses import dataclass, replace +from typing import TYPE_CHECKING, Any, Literal, NotRequired, TypeAlias, TypedDict, cast + +import numpy as np + +from zarr.abc.codec import ArrayArrayCodec +from zarr.core.common import JSON, parse_named_configuration +from zarr.core.dtype 
import get_data_type_from_json + +if TYPE_CHECKING: + from typing import Self + + from zarr.core.array_spec import ArraySpec + from zarr.core.buffer import NDBuffer + from zarr.core.chunk_grids import ChunkGrid + from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType + +NumericScalar: TypeAlias = np.integer[Any] | np.floating[Any] + +RoundingMode = Literal[ + "nearest-even", + "towards-zero", + "towards-positive", + "towards-negative", + "nearest-away", +] + +OutOfRangeMode = Literal["clamp", "wrap"] + + +class ScalarMapJSON(TypedDict): + encode: NotRequired[tuple[tuple[object, object]]] + decode: NotRequired[tuple[tuple[object, object]]] + + +# Pre-parsed scalar map entry: (source_scalar, target_scalar) +_MapEntry = tuple[NumericScalar, NumericScalar] + + +def _parse_map_entries( + mapping: dict[str, str], + src_dtype: ZDType[TBaseDType, TBaseScalar], + tgt_dtype: ZDType[TBaseDType, TBaseScalar], +) -> list[_MapEntry]: + """Pre-parse a scalar map dict into a list of (src, tgt) tuples. + + Each entry's source value is deserialized using ``src_dtype`` and its target + value using ``tgt_dtype``, preserving full precision for both data types. + """ + entries: list[_MapEntry] = [] + for src_str, tgt_str in mapping.items(): + src = src_dtype.from_json_scalar(src_str, zarr_format=3) + tgt = tgt_dtype.from_json_scalar(tgt_str, zarr_format=3) + entries.append((src, tgt)) # type: ignore[arg-type] + return entries + + +def _apply_scalar_map(work: np.ndarray[Any, np.dtype[Any]], entries: list[_MapEntry]) -> None: + """Apply scalar map entries in-place. Single pass per entry.""" + for src, tgt in entries: + if isinstance(src, (float, np.floating)) and np.isnan(src): + mask = np.isnan(work) + else: + mask = work == src + work[mask] = tgt + + +def _round_inplace( + arr: np.ndarray[Any, np.dtype[Any]], mode: RoundingMode +) -> np.ndarray[Any, np.dtype[Any]]: + """Round array, returning result (may or may not be a new array). 
+ + For nearest-away, requires 3 numpy ops. All others are a single op. + """ + match mode: + case "nearest-even": + return np.rint(arr) # type: ignore [no-any-return] + case "towards-zero": + return np.trunc(arr) # type: ignore [no-any-return] + case "towards-positive": + return np.ceil(arr) # type: ignore [no-any-return] + case "towards-negative": + return np.floor(arr) # type: ignore [no-any-return] + case "nearest-away": + return np.sign(arr) * np.floor(np.abs(arr) + 0.5) # type: ignore [no-any-return] + raise ValueError(f"Unknown rounding mode: {mode}") + + +def _cast_array( + arr: np.ndarray[Any, np.dtype[Any]], + *, + target_dtype: np.dtype[Any], + rounding_mode: RoundingMode, + out_of_range_mode: OutOfRangeMode | None, + scalar_map_entries: list[_MapEntry] | None, +) -> np.ndarray[Any, np.dtype[Any]]: + """Cast an array to target_dtype with rounding, out-of-range, and scalar_map handling. + + Optimized to minimize allocations and passes over the data. + For the simple case (no scalar_map, no rounding needed, no out-of-range), + this is essentially just ``arr.astype(target_dtype)``. + + All casts are performed under ``np.errstate(over='raise', invalid='raise')`` + so that numpy overflow or invalid-value warnings become hard errors instead + of being silently swallowed. 
+ """ + with np.errstate(over="raise", invalid="raise"): + return _cast_array_impl( + arr, + target_dtype=target_dtype, + rounding=rounding_mode, + out_of_range=out_of_range_mode, + scalar_map_entries=scalar_map_entries, + ) + + +def _check_int_range( + work: np.ndarray[Any, np.dtype[Any]], + *, + target_dtype: np.dtype[Any], + out_of_range: OutOfRangeMode | None, +) -> np.ndarray[Any, np.dtype[Any]]: + """Check integer range and apply out-of-range handling, then cast.""" + info = np.iinfo(target_dtype) + lo, hi = int(info.min), int(info.max) + w_min, w_max = int(work.min()), int(work.max()) + if w_min >= lo and w_max <= hi: + return work.astype(target_dtype) + match out_of_range: + case "clamp": + return np.clip(work, lo, hi).astype(target_dtype) + case "wrap": + range_size = hi - lo + 1 + return ((work.astype(np.int64) - lo) % range_size + lo).astype(target_dtype) + case None: + oor_vals = work[(work < lo) | (work > hi)] + raise ValueError( + f"Values out of range for {target_dtype} (valid range: [{lo}, {hi}]), " + f"got values in [{w_min}, {w_max}]. " + f"Out-of-range values: {oor_vals.ravel()!r}. " + f"Set out_of_range='clamp' or out_of_range='wrap' to handle this." 
+ ) + + +def _cast_array_impl( + arr: np.ndarray[Any, np.dtype[Any]], + *, + target_dtype: np.dtype[Any], + rounding: RoundingMode, + out_of_range: OutOfRangeMode | None, + scalar_map_entries: list[_MapEntry] | None, +) -> np.ndarray[Any, np.dtype[Any]]: + src_type: Literal["int", "float"] = "int" if np.issubdtype(arr.dtype, np.integer) else "float" + tgt_type: Literal["int", "float"] = ( + "int" if np.issubdtype(target_dtype, np.integer) else "float" + ) + has_map = bool(scalar_map_entries) + + match (src_type, tgt_type, has_map): + # float→float or int→float without scalar_map — single astype + case (_, "float", False): + return arr.astype(target_dtype) + + # int→float with scalar_map — widen to float64, apply map, cast + case ("int", "float", True): + work = arr.astype(np.float64) + _apply_scalar_map(work, scalar_map_entries) # type: ignore[arg-type] + return work.astype(target_dtype) + + # float→float with scalar_map — copy, apply map, cast + case ("float", "float", True): + work = arr.copy() + _apply_scalar_map(work, scalar_map_entries) # type: ignore[arg-type] + return work.astype(target_dtype) + + # int→int without scalar_map — range check then astype + case ("int", "int", False): + if arr.dtype.itemsize > target_dtype.itemsize or arr.dtype != target_dtype: + return _check_int_range(arr, target_dtype=target_dtype, out_of_range=out_of_range) + return arr.astype(target_dtype) + + # int→int with scalar_map — widen to int64, apply map, range check + case ("int", "int", True): + work = arr.astype(np.int64) + _apply_scalar_map(work, scalar_map_entries) # type: ignore[arg-type] + return _check_int_range(work, target_dtype=target_dtype, out_of_range=out_of_range) + + # float→int (with or without scalar_map) — rounding + range check + case ("float", "int", _): + if arr.dtype != np.float64: + work = arr.astype(np.float64) + else: + work = arr.copy() + + if scalar_map_entries: + _apply_scalar_map(work, scalar_map_entries) + + bad = np.isnan(work) | np.isinf(work) + if 
bad.any(): + raise ValueError("Cannot cast NaN or Infinity to integer type without scalar_map") + + work = _round_inplace(work, rounding) + return _check_int_range(work, target_dtype=target_dtype, out_of_range=out_of_range) + + raise AssertionError( + f"Unhandled type combination: src={src_type}, tgt={tgt_type}" + ) # pragma: no cover + + +def _extract_raw_map(data: ScalarMapJSON | None, direction: str) -> dict[str, str] | None: + """Extract raw string mapping from scalar_map JSON for 'encode' or 'decode'.""" + if data is None: + return None + raw: dict[str, str] = {} + pairs = data.get(direction, []) + for src, tgt in pairs: # type: ignore[attr-defined] + raw[str(src)] = str(tgt) + return raw or None + + +@dataclass(frozen=True) +class CastValue(ArrayArrayCodec): + """Cast-value array-to-array codec. + + Value-converts array elements to a new data type during encoding, + and back to the original data type during decoding. + + Parameters + ---------- + data_type : str + Target zarr v3 data type name (e.g. "uint8", "float32"). + rounding : RoundingMode + How to round when exact representation is impossible. Default is "nearest-even". + out_of_range : OutOfRangeMode or None + What to do when a value is outside the target's range. + None means error. "clamp" clips to range. "wrap" uses modular arithmetic + (only valid for integer types). + scalar_map : dict or None + Explicit value overrides as JSON: {"encode": [[src, tgt], ...], "decode": [[src, tgt], ...]}. 
+ """ + + is_fixed_size = True + + dtype: ZDType[TBaseDType, TBaseScalar] + rounding: RoundingMode + out_of_range: OutOfRangeMode | None + scalar_map: ScalarMapJSON | None + + def __init__( + self, + *, + data_type: str | ZDType[TBaseDType, TBaseScalar], + rounding: RoundingMode = "nearest-even", + out_of_range: OutOfRangeMode | None = None, + scalar_map: ScalarMapJSON | None = None, + ) -> None: + if isinstance(data_type, str): + dtype = get_data_type_from_json(data_type, zarr_format=3) + else: + dtype = data_type + object.__setattr__(self, "dtype", dtype) + object.__setattr__(self, "rounding", rounding) + object.__setattr__(self, "out_of_range", out_of_range) + object.__setattr__(self, "scalar_map", scalar_map) + + @classmethod + def from_dict(cls, data: dict[str, JSON]) -> Self: + _, configuration_parsed = parse_named_configuration( + data, "cast_value", require_configuration=True + ) + return cls(**configuration_parsed) # type: ignore[arg-type] + + def to_dict(self) -> dict[str, JSON]: + config: dict[str, JSON] = {"data_type": cast(JSON, self.dtype.to_json(zarr_format=3))} + if self.rounding != "nearest-even": + config["rounding"] = self.rounding + if self.out_of_range is not None: + config["out_of_range"] = self.out_of_range + if self.scalar_map is not None: + config["scalar_map"] = cast(JSON, self.scalar_map) + return {"name": "cast_value", "configuration": config} + + def validate( + self, + *, + shape: tuple[int, ...], + dtype: ZDType[TBaseDType, TBaseScalar], + chunk_grid: ChunkGrid, + ) -> None: + source_native = dtype.to_native_dtype() + target_native = self.dtype.to_native_dtype() + for label, dt in [("source", source_native), ("target", target_native)]: + if not np.issubdtype(dt, np.integer) and not np.issubdtype(dt, np.floating): + raise ValueError( + f"cast_value codec only supports integer and floating-point data types. " + f"Got {label} dtype {dt}." 
+ ) + if self.out_of_range == "wrap" and not np.issubdtype(target_native, np.integer): + raise ValueError("out_of_range='wrap' is only valid for integer target types.") + # Check that int→float casts won't silently lose precision. + # A float type with `m` mantissa bits can exactly represent all integers + # in [-2**m, 2**m]. If the integer type's range exceeds that, the cast is lossy. + if np.issubdtype(source_native, np.integer) and np.issubdtype(target_native, np.floating): + int_info = np.iinfo(source_native) # type: ignore[type-var] + mantissa_bits = np.finfo(target_native).nmant # type: ignore[arg-type] + max_exact_int = 2**mantissa_bits + if int_info.max > max_exact_int or int_info.min < -max_exact_int: + raise ValueError( + f"Casting {source_native} to {target_native} may silently lose precision. " + f"{target_native} can only exactly represent integers up to 2**{mantissa_bits} " + f"({max_exact_int}), but {source_native} has range " + f"[{int_info.min}, {int_info.max}]." + ) + # Same check for float→int decode direction + if np.issubdtype(target_native, np.integer) and np.issubdtype(source_native, np.floating): + int_info = np.iinfo(target_native) # type: ignore[type-var] + mantissa_bits = np.finfo(source_native).nmant # type: ignore[arg-type] + max_exact_int = 2**mantissa_bits + if int_info.max > max_exact_int or int_info.min < -max_exact_int: + raise ValueError( + f"Casting {source_native} to {target_native} may silently lose precision. " + f"{source_native} can only exactly represent integers up to 2**{mantissa_bits} " + f"({max_exact_int}), but {target_native} has range " + f"[{int_info.min}, {int_info.max}]." 
+ ) + + def resolve_metadata(self, chunk_spec: ArraySpec) -> ArraySpec: + target_zdtype = self.dtype + target_native = target_zdtype.to_native_dtype() + source_native = chunk_spec.dtype.to_native_dtype() + + fill = chunk_spec.fill_value + fill_arr = np.array([fill], dtype=source_native) + + encode_raw = _extract_raw_map(self.scalar_map, "encode") + encode_entries = ( + _parse_map_entries(encode_raw, chunk_spec.dtype, self.dtype) if encode_raw else None + ) + + new_fill_arr = _cast_array( + fill_arr, + target_dtype=target_native, + rounding_mode=self.rounding, + out_of_range_mode=self.out_of_range, + scalar_map_entries=encode_entries, + ) + new_fill = target_native.type(new_fill_arr[0]) + + return replace(chunk_spec, dtype=target_zdtype, fill_value=new_fill) + + def _encode_sync( + self, + chunk_array: NDBuffer, + _chunk_spec: ArraySpec, + ) -> NDBuffer | None: + arr = chunk_array.as_ndarray_like() + target_native = self.dtype.to_native_dtype() + + encode_raw = _extract_raw_map(self.scalar_map, "encode") + encode_entries = ( + _parse_map_entries(encode_raw, _chunk_spec.dtype, self.dtype) if encode_raw else None + ) + + result = _cast_array( + np.asarray(arr), + target_dtype=target_native, + rounding_mode=self.rounding, + out_of_range_mode=self.out_of_range, + scalar_map_entries=encode_entries, + ) + return chunk_array.__class__.from_ndarray_like(result) + + async def _encode_single( + self, + chunk_array: NDBuffer, + chunk_spec: ArraySpec, + ) -> NDBuffer | None: + return self._encode_sync(chunk_array, chunk_spec) + + def _decode_sync( + self, + chunk_array: NDBuffer, + chunk_spec: ArraySpec, + ) -> NDBuffer: + arr = chunk_array.as_ndarray_like() + target_native = chunk_spec.dtype.to_native_dtype() + + decode_raw = _extract_raw_map(self.scalar_map, "decode") + decode_entries = ( + _parse_map_entries(decode_raw, self.dtype, chunk_spec.dtype) if decode_raw else None + ) + + result = _cast_array( + np.asarray(arr), + target_dtype=target_native, + 
@dataclass(kw_only=True, frozen=True)
class ScaleOffset(ArrayArrayCodec):
    """Scale-offset array-to-array codec.

    Encodes values as ``(in - offset) * scale`` and decodes them as
    ``(in / scale) + offset``. The data type is not changed: ``offset`` and
    ``scale`` are converted to the array's own data type and all arithmetic is
    done in that type. For integer arrays the decode division is therefore an
    integer division.

    Attributes
    ----------
    offset : float
        Value subtracted during encoding. Default is 0.
    scale : float
        Value multiplied during encoding (after offset subtraction). Default is 1.
    """

    is_fixed_size: bool = field(default=True, init=False)

    offset: float = 0
    scale: float = 1

    @classmethod
    def from_dict(cls, data: ScaleOffsetJSON) -> Self:  # type: ignore[override]
        """Build the codec from its JSON form; ``configuration`` may be absent."""
        _, configuration_parsed = parse_named_configuration(
            data, "scale_offset", require_configuration=False
        )
        if configuration_parsed is None:
            return cls()
        return cls(**configuration_parsed)  # type: ignore[arg-type]

    def to_dict(self) -> ScaleOffsetJSON:  # type: ignore[override]
        """Serialize the codec, leaving out every field that holds its default.

        When both fields are defaults the ``configuration`` key is omitted.
        """
        config: ScaleOffsetConfig = {}
        if self.offset != 0:
            config["offset"] = self.offset
        if self.scale != 1:
            config["scale"] = self.scale
        if not config:
            return {"name": "scale_offset"}
        return {"name": "scale_offset", "configuration": config}

    def validate(
        self,
        *,
        shape: tuple[int, ...],
        dtype: ZDType[TBaseDType, TBaseScalar],
        chunk_grid: ChunkGrid,
    ) -> None:
        """Check that the codec can be applied to an array of data type ``dtype``.

        ``shape`` and ``chunk_grid`` are not inspected.

        Raises
        ------
        ValueError
            If ``dtype`` is neither integer nor floating-point, if ``scale`` is
            zero, or if ``dtype`` is an integer type and ``scale`` or ``offset``
            is a float with a fractional part.
        """
        native = dtype.to_native_dtype()
        is_integer = np.issubdtype(native, np.integer)
        if not is_integer and not np.issubdtype(native, np.floating):
            raise ValueError(
                f"scale_offset codec only supports integer and floating-point data types. "
                f"Got {dtype}."
            )
        if self.scale == 0:
            # Encoding would erase every value and decoding would divide by zero.
            raise ValueError("scale_offset codec requires a non-zero scale.")
        if is_integer:
            # Converting to an integer dtype truncates: a scale of 0.1 would
            # silently become 0.
            for label, value in (("offset", self.offset), ("scale", self.scale)):
                if isinstance(value, float) and not value.is_integer():
                    raise ValueError(
                        f"scale_offset {label} must be an integer for integer data type "
                        f"{dtype}. Got {value}."
                    )

    def resolve_metadata(self, chunk_spec: ArraySpec) -> ArraySpec:
        """
        Define the effect of this codec on the spec for an array. The only change is to update
        the output fill value by applying the scale + offset transformation.
        """
        to_scalar = chunk_spec.dtype.to_native_dtype().type
        new_fill = (
            to_scalar(chunk_spec.fill_value) - to_scalar(self.offset)  # type: ignore[operator]
        ) * to_scalar(self.scale)
        return replace(chunk_spec, fill_value=new_fill)

    def _encode_sync(
        self,
        chunk_array: NDBuffer,
        _chunk_spec: ArraySpec,
    ) -> NDBuffer | None:
        """Return ``(chunk - offset) * scale`` computed in the chunk's own dtype."""
        arr = chunk_array.as_ndarray_like()
        to_scalar = arr.dtype.type
        result = (arr - to_scalar(self.offset)) * to_scalar(self.scale)
        return chunk_array.__class__.from_ndarray_like(result)

    async def _encode_single(
        self,
        chunk_data: NDBuffer,
        chunk_spec: ArraySpec,
    ) -> NDBuffer | None:
        """Async wrapper around ``_encode_sync``."""
        return self._encode_sync(chunk_data, chunk_spec)

    def _decode_sync(
        self,
        chunk_array: NDBuffer,
        _chunk_spec: ArraySpec,
    ) -> NDBuffer:
        """Return ``(chunk / scale) + offset`` computed in the chunk's own dtype."""
        arr = chunk_array.as_ndarray_like()
        scale = arr.dtype.type(self.scale)
        offset = arr.dtype.type(self.offset)
        if np.issubdtype(arr.dtype, np.integer):
            # True division would promote integer data to float64 and change the
            # dtype of the decoded chunk. Floor division keeps the dtype, and is
            # exact for values produced by encoding (multiples of scale).
            result = (arr // scale) + offset
        else:
            result = (arr / scale) + offset
        return chunk_array.__class__.from_ndarray_like(result)

    async def _decode_single(
        self,
        chunk_data: NDBuffer,
        chunk_spec: ArraySpec,
    ) -> NDBuffer:
        """Async wrapper around ``_decode_sync``."""
        return self._decode_sync(chunk_data, chunk_spec)

    def compute_encoded_size(self, input_byte_length: int, _chunk_spec: ArraySpec) -> int:
        """Return ``input_byte_length`` unchanged; the codec keeps the dtype."""
        return input_byte_length
store = MemoryStore() + data = np.array([1.0, 2.0, 3.0], dtype="float64") + arr = zarr.create( + store=store, + shape=data.shape, + dtype=data.dtype, + chunks=(3,), + codecs=[CastValue(data_type="float32"), BytesCodec()], + ) + arr[:] = data + result = arr[:] + np.testing.assert_allclose(result, data) # type: ignore[arg-type] + + def test_float64_to_int32_towards_zero(self) -> None: + """Cast float64 to int32 with towards-zero rounding.""" + store = MemoryStore() + data = np.array([1.7, -1.7, 2.3, -2.3], dtype="float64") + arr = zarr.create( + store=store, + shape=data.shape, + dtype=data.dtype, + chunks=(4,), + codecs=[CastValue(data_type="int32", rounding="towards-zero"), BytesCodec()], + ) + arr[:] = data + result = arr[:] + # After encoding to int32 with towards-zero: [1, -1, 2, -2] + # After decoding back to float64: [1.0, -1.0, 2.0, -2.0] + np.testing.assert_array_equal(result, [1.0, -1.0, 2.0, -2.0]) + + def test_float64_to_uint8_clamp(self) -> None: + """Cast float64 to uint8 with clamping out-of-range values.""" + store = MemoryStore() + data = np.array([0.0, 128.0, 300.0, -10.0], dtype="float64") + arr = zarr.create( + store=store, + shape=data.shape, + dtype=data.dtype, + chunks=(4,), + codecs=[ + CastValue(data_type="uint8", rounding="nearest-even", out_of_range="clamp"), + BytesCodec(), + ], + ) + arr[:] = data + result = arr[:] + np.testing.assert_array_equal(result, [0.0, 128.0, 255.0, 0.0]) + + def test_float64_to_int8_wrap(self) -> None: + """Cast float64 to int8 with wrapping for out-of-range values.""" + store = MemoryStore() + data = np.array([200.0], dtype="float64") + arr = zarr.create( + store=store, + shape=data.shape, + dtype=data.dtype, + chunks=(1,), + codecs=[ + CastValue(data_type="int8", rounding="nearest-even", out_of_range="wrap"), + BytesCodec(), + ], + ) + arr[:] = data + result = arr[:] + # 200 wraps in int8 range [-128, 127]: (200 - (-128)) % 256 + (-128) = 328 % 256 - 128 = 72 - 128 = -56 + # Decode: -56 cast back to float64 = 
-56.0 + np.testing.assert_array_equal(result, [-56.0]) + + def test_nan_to_integer_without_scalar_map_errors(self) -> None: + """NaN cast to integer without scalar_map should raise.""" + store = MemoryStore() + data = np.array([float("nan")], dtype="float64") + arr = zarr.create( + store=store, + shape=data.shape, + dtype=data.dtype, + chunks=(1,), + codecs=[CastValue(data_type="uint8", out_of_range="clamp"), BytesCodec()], + ) + with pytest.raises(ValueError, match="Cannot cast NaN"): + arr[:] = data + + def test_nan_scalar_map(self) -> None: + """NaN should be mapped via scalar_map when provided.""" + store = MemoryStore() + data = np.array([1.0, float("nan"), 3.0], dtype="float64") + arr = zarr.create( + store=store, + shape=data.shape, + dtype=data.dtype, + chunks=(3,), + codecs=[ + CastValue( + data_type="uint8", + out_of_range="clamp", + scalar_map={ # type: ignore[arg-type] + "encode": [["NaN", 0]], + "decode": [[0, "NaN"]], + }, + ), + BytesCodec(), + ], + ) + arr[:] = data + result = arr[:] + assert result[0] == 1.0 # type: ignore[index] + assert np.isnan(result[1]) # type: ignore[index] + assert result[2] == 3.0 # type: ignore[index] + + def test_hex_nan_scalar_map(self) -> None: + """Hex-encoded NaN values in scalar_map should round-trip correctly. + + The hex string encoding is used for preserving specific NaN payloads + per the Zarr v3 spec. 
+ """ + import struct + + # 0x7fc00001 is a NaN with a non-default payload in float32 + hex_nan = "0x7fc00001" + nan_bytes = bytes.fromhex("7fc00001") + nan_f32 = np.float32(struct.unpack(">f", nan_bytes)[0]) + assert np.isnan(nan_f32) + + store = MemoryStore() + data = np.array([1.0, nan_f32, 3.0], dtype="float32") + arr = zarr.create( + store=store, + shape=data.shape, + dtype="float32", + chunks=(3,), + codecs=[ + CastValue( + data_type="uint8", + out_of_range="clamp", + scalar_map={ # type: ignore[arg-type] + "encode": [[hex_nan, 255]], + "decode": [[255, hex_nan]], + }, + ), + BytesCodec(), + ], + ) + arr[:] = data + result = arr[:] + assert result[0] == np.float32(1.0) # type: ignore[index] + assert np.isnan(result[1]) # type: ignore[index] + assert result[2] == np.float32(3.0) # type: ignore[index] + + # Verify the NaN payload is preserved by checking the bit pattern + result_bytes = struct.pack(">f", result[1]) # type: ignore[index] + assert result_bytes == nan_bytes + + def test_int64_to_float64_precision_loss_rejected(self) -> None: + """Casting int64 to float64 is rejected because float64 cannot + exactly represent all int64 values. + + float64 has a 52-bit mantissa, so it can only exactly represent + integers up to 2**52. int64.max is 2**63 - 1, far exceeding this. 
+ """ + store = MemoryStore() + with pytest.raises(ValueError, match="may silently lose precision"): + zarr.create( + store=store, + shape=(1,), + dtype="int64", + chunks=(1,), + codecs=[ + CastValue(data_type="float64"), + BytesCodec(), + ], + ) + + def test_int32_to_float64_ok(self) -> None: + """Casting int32 to float64 is safe because float64 has enough + mantissa bits (52) to exactly represent all int32 values (up to 2**31 - 1).""" + store = MemoryStore() + data = np.array([np.iinfo(np.int32).max, np.iinfo(np.int32).min], dtype="int32") + arr = zarr.create( + store=store, + shape=data.shape, + dtype="int32", + chunks=(2,), + codecs=[ + CastValue(data_type="float64"), + BytesCodec(), + ], + ) + arr[:] = data + result = arr[:] + np.testing.assert_array_equal(result, data) + + def test_rounding_nearest_even(self) -> None: + """nearest-even rounding: 0.5 rounds to 0, 1.5 rounds to 2.""" + store = MemoryStore() + data = np.array([0.5, 1.5, 2.5, 3.5], dtype="float64") + arr = zarr.create( + store=store, + shape=data.shape, + dtype=data.dtype, + chunks=(4,), + codecs=[ + CastValue(data_type="int32", rounding="nearest-even"), + BytesCodec(), + ], + ) + arr[:] = data + result = arr[:] + np.testing.assert_array_equal(result, [0.0, 2.0, 2.0, 4.0]) + + def test_rounding_towards_positive(self) -> None: + """towards-positive rounds up (ceil).""" + store = MemoryStore() + data = np.array([1.1, -1.1, 1.9, -1.9], dtype="float64") + arr = zarr.create( + store=store, + shape=data.shape, + dtype=data.dtype, + chunks=(4,), + codecs=[ + CastValue(data_type="int32", rounding="towards-positive"), + BytesCodec(), + ], + ) + arr[:] = data + result = arr[:] + np.testing.assert_array_equal(result, [2.0, -1.0, 2.0, -1.0]) + + def test_rounding_towards_negative(self) -> None: + """towards-negative rounds down (floor).""" + store = MemoryStore() + data = np.array([1.1, -1.1, 1.9, -1.9], dtype="float64") + arr = zarr.create( + store=store, + shape=data.shape, + dtype=data.dtype, + 
chunks=(4,), + codecs=[ + CastValue(data_type="int32", rounding="towards-negative"), + BytesCodec(), + ], + ) + arr[:] = data + result = arr[:] + np.testing.assert_array_equal(result, [1.0, -2.0, 1.0, -2.0]) + + def test_rounding_nearest_away(self) -> None: + """nearest-away rounds 0.5 away from zero.""" + store = MemoryStore() + data = np.array([0.5, 1.5, -0.5, -1.5], dtype="float64") + arr = zarr.create( + store=store, + shape=data.shape, + dtype=data.dtype, + chunks=(4,), + codecs=[ + CastValue(data_type="int32", rounding="nearest-away"), + BytesCodec(), + ], + ) + arr[:] = data + result = arr[:] + np.testing.assert_array_equal(result, [1.0, 2.0, -1.0, -2.0]) + + def test_out_of_range_errors_by_default(self) -> None: + """Without out_of_range, values outside target range should error.""" + store = MemoryStore() + data = np.array([300.0], dtype="float64") + arr = zarr.create( + store=store, + shape=data.shape, + dtype=data.dtype, + chunks=(1,), + codecs=[CastValue(data_type="uint8"), BytesCodec()], + ) + with pytest.raises(ValueError, match="out of range"): + arr[:] = data + + def test_wrap_only_valid_for_integers(self) -> None: + """wrap should be rejected for float target types.""" + with pytest.raises(ValueError, match="only valid for integer"): + zarr.create( + store=MemoryStore(), + shape=(5,), + dtype="float64", + chunks=(5,), + codecs=[ + CastValue(data_type="float32", out_of_range="wrap"), + BytesCodec(), + ], + ) + + def test_validate_rejects_complex_source(self) -> None: + """Validate should reject complex source dtype.""" + with pytest.raises(ValueError, match="only supports integer and floating-point"): + zarr.create( + store=MemoryStore(), + shape=(5,), + dtype="complex128", + chunks=(5,), + codecs=[CastValue(data_type="float64"), BytesCodec()], + ) + + def test_int32_to_int16_clamp(self) -> None: + """Integer-to-integer cast with clamping.""" + store = MemoryStore() + data = np.array([0, 100, 40000, -40000], dtype="int32") + arr = zarr.create( + 
store=store, + shape=data.shape, + dtype=data.dtype, + chunks=(4,), + codecs=[ + CastValue(data_type="int16", out_of_range="clamp"), + BytesCodec(), + ], + ) + arr[:] = data + result = arr[:] + np.testing.assert_array_equal(result, [0, 100, 32767, -32768]) + + def test_to_dict(self) -> None: + """Serialization to dict.""" + codec = CastValue( + data_type="uint8", + rounding="towards-zero", + out_of_range="clamp", + scalar_map={"encode": [["NaN", 0]], "decode": [[0, "NaN"]]}, # type: ignore[arg-type] + ) + d = codec.to_dict() + assert d == { + "name": "cast_value", + "configuration": { + "data_type": "uint8", + "rounding": "towards-zero", + "out_of_range": "clamp", + "scalar_map": {"encode": [["NaN", 0]], "decode": [[0, "NaN"]]}, + }, + } + + def test_to_dict_minimal(self) -> None: + """Only required fields in dict when defaults are used.""" + codec = CastValue(data_type="float32") + d = codec.to_dict() + assert d == {"name": "cast_value", "configuration": {"data_type": "float32"}} + + def test_from_dict(self) -> None: + """Deserialization from dict.""" + codec = CastValue.from_dict( + { + "name": "cast_value", + "configuration": { + "data_type": "uint8", + "rounding": "towards-zero", + "out_of_range": "clamp", + }, + } + ) + assert codec.dtype == get_data_type_from_json("uint8", zarr_format=3) + assert codec.rounding == "towards-zero" + assert codec.out_of_range == "clamp" + + def test_roundtrip_json(self) -> None: + """to_dict -> from_dict should preserve all parameters.""" + original = CastValue( + data_type="int16", + rounding="towards-negative", + out_of_range="clamp", + scalar_map={"encode": [["NaN", 0]]}, # type: ignore[arg-type] + ) + restored = CastValue.from_dict(original.to_dict()) + assert restored.dtype == original.dtype + assert restored.rounding == original.rounding + assert restored.out_of_range == original.out_of_range + assert restored.scalar_map == original.scalar_map + + def test_fill_value_cast(self) -> None: + """Fill value should be cast to 
the target dtype.""" + store = MemoryStore() + arr = zarr.create( + store=store, + shape=(5,), + dtype="float64", + chunks=(5,), + fill_value=42.0, + codecs=[CastValue(data_type="int32"), BytesCodec()], + ) + result = arr[:] + np.testing.assert_array_equal(result, np.full(5, 42.0)) + + def test_computed_encoded_size(self) -> None: + """Encoded size should reflect the target dtype's item size.""" + codec = CastValue(data_type="uint8") + from zarr.core.array_spec import ArrayConfig, ArraySpec + from zarr.core.buffer.cpu import buffer_prototype + from zarr.core.dtype import parse_dtype + + spec = ArraySpec( + shape=(10,), + dtype=parse_dtype("float64", zarr_format=3), + fill_value=0.0, + config=ArrayConfig.from_dict({}), + prototype=buffer_prototype, + ) + # 10 float64 elements = 80 bytes input, 10 uint8 elements = 10 bytes output + assert codec.compute_encoded_size(80, spec) == 10 + + +class TestScaleOffsetAndCastValueCombined: + """Tests for the combined scale_offset + cast_value codec pipeline.""" + + def test_float64_to_uint8_roundtrip(self) -> None: + """Typical usage: float64 -> scale_offset -> cast_value(uint8) -> bytes.""" + store = MemoryStore() + # Data in range [0, 25.5] maps to [0, 255] with scale=10 + data = np.array([0.0, 1.0, 2.5, 10.0, 25.5], dtype="float64") + arr = zarr.create( + store=store, + shape=data.shape, + dtype=data.dtype, + chunks=(5,), + codecs=[ + ScaleOffset(offset=0, scale=10), + CastValue(data_type="uint8", out_of_range="clamp"), + BytesCodec(), + ], + ) + arr[:] = data + result = arr[:] + np.testing.assert_allclose(result, data, atol=0.1) # type: ignore[arg-type] + + def test_temperature_storage_pattern(self) -> None: + """Realistic pattern: store temperature data as uint8. + + Temperature range: -10°C to 45°C + Encode: (temp - (-10)) * (255/55) = (temp + 10) * 4.636... 
+ Use offset=-10, scale=255/55 + """ + store = MemoryStore() + offset = -10.0 + scale = 255.0 / 55.0 + data = np.array([-10.0, 0.0, 20.0, 37.5, 45.0], dtype="float64") + arr = zarr.create( + store=store, + shape=data.shape, + dtype=data.dtype, + chunks=(5,), + codecs=[ + ScaleOffset(offset=offset, scale=scale), + CastValue(data_type="uint8", out_of_range="clamp"), + BytesCodec(), + ], + ) + arr[:] = data + result = arr[:] + # Precision limited by uint8 quantization (~0.22°C step) + np.testing.assert_allclose(result, data, atol=0.25) # type: ignore[arg-type] + + def test_nan_handling_pipeline(self) -> None: + """NaN values should be handled via scalar_map in cast_value.""" + store = MemoryStore() + data = np.array([1.0, float("nan"), 3.0], dtype="float64") + arr = zarr.create( + store=store, + shape=data.shape, + dtype=data.dtype, + chunks=(3,), + fill_value=float("nan"), + codecs=[ + ScaleOffset(offset=0, scale=1), + CastValue( + data_type="uint8", + out_of_range="clamp", + scalar_map={ # type: ignore[arg-type] + "encode": [["NaN", 0]], + "decode": [[0, "NaN"]], + }, + ), + BytesCodec(), + ], + ) + arr[:] = data + result = arr[:] + assert result[0] == 1.0 # type: ignore[index] + assert np.isnan(result[1]) # type: ignore[index] + assert result[2] == 3.0 # type: ignore[index] + + def test_metadata_persistence(self) -> None: + """Array metadata should be correctly persisted and reloaded.""" + store = MemoryStore() + zarr.create( + store=store, + shape=(10,), + dtype="float64", + chunks=(10,), + codecs=[ + ScaleOffset(offset=5, scale=0.5), + CastValue(data_type="int16", out_of_range="clamp"), + BytesCodec(), + ], + ) + # Reopen from same store + arr2 = zarr.open_array(store, mode="r") + assert arr2.dtype == np.dtype("float64") + assert arr2.shape == (10,) diff --git a/tests/test_codecs/test_scale_offset.py b/tests/test_codecs/test_scale_offset.py new file mode 100644 index 0000000000..ace3445c10 --- /dev/null +++ b/tests/test_codecs/test_scale_offset.py @@ -0,0 +1,166 
"""Test suite for the ``scale_offset`` codec."""

from __future__ import annotations

import numpy as np
import pytest

import zarr
from zarr.codecs.bytes import BytesCodec
from zarr.codecs.scale_offset import ScaleOffset
from zarr.storage import MemoryStore


class TestScaleOffsetCodec:
    """Round-trip, validation and JSON (de)serialization checks for ``ScaleOffset``."""

    def test_identity(self) -> None:
        """A codec built without arguments must hand back exactly what was written."""
        values = np.arange(20, dtype="float64").reshape(4, 5)
        array = zarr.create(
            shape=values.shape,
            dtype=values.dtype,
            chunks=(4, 5),
            codecs=[ScaleOffset(), BytesCodec()],
            store=MemoryStore(),
        )
        array[:] = values
        decoded = array[:]
        np.testing.assert_array_equal(decoded, values)

    def test_encode_decode_float64(self) -> None:
        """float64 values survive a write/read cycle with offset=10, scale=0.1."""
        values = np.array([10.0, 20.0, 30.0, 40.0, 50.0], dtype="float64")
        array = zarr.create(
            shape=values.shape,
            dtype=values.dtype,
            chunks=(5,),
            codecs=[ScaleOffset(offset=10, scale=0.1), BytesCodec()],
            store=MemoryStore(),
        )
        array[:] = values
        # Compared with a relative tolerance rather than exact equality.
        np.testing.assert_allclose(array[:], values, rtol=1e-10)  # type: ignore[arg-type]

    def test_encode_decode_float32(self) -> None:
        """float32 values survive a write/read cycle with offset=1, scale=2."""
        values = np.array([1.0, 2.0, 3.0], dtype="float32")
        array = zarr.create(
            shape=values.shape,
            dtype=values.dtype,
            chunks=(3,),
            codecs=[ScaleOffset(offset=1, scale=2), BytesCodec()],
            store=MemoryStore(),
        )
        array[:] = values
        np.testing.assert_allclose(array[:], values, rtol=1e-6)  # type: ignore[arg-type]

    def test_encode_decode_integer(self) -> None:
        """int32 values come back bit-for-bit equal with offset=10, scale=1."""
        values = np.array([10, 20, 30, 40, 50], dtype="int32")
        array = zarr.create(
            shape=values.shape,
            dtype=values.dtype,
            chunks=(5,),
            codecs=[ScaleOffset(offset=10, scale=1), BytesCodec()],
            store=MemoryStore(),
        )
        array[:] = values
        # Exact comparison: no tolerance is allowed for the integer case.
        np.testing.assert_array_equal(array[:], values)

    def test_offset_only(self) -> None:
        """Supplying just ``offset`` (scale left unset) still round-trips."""
        values = np.array([100.0, 200.0, 300.0], dtype="float64")
        array = zarr.create(
            shape=values.shape,
            dtype=values.dtype,
            chunks=(3,),
            codecs=[ScaleOffset(offset=100), BytesCodec()],
            store=MemoryStore(),
        )
        array[:] = values
        decoded = array[:]
        np.testing.assert_allclose(decoded, values)  # type: ignore[arg-type]

    def test_scale_only(self) -> None:
        """Supplying just ``scale`` (offset left unset) still round-trips."""
        values = np.array([1.0, 2.0, 3.0], dtype="float64")
        array = zarr.create(
            shape=values.shape,
            dtype=values.dtype,
            chunks=(3,),
            codecs=[ScaleOffset(scale=10), BytesCodec()],
            store=MemoryStore(),
        )
        array[:] = values
        decoded = array[:]
        np.testing.assert_allclose(decoded, values)  # type: ignore[arg-type]

    def test_fill_value_transformed(self) -> None:
        """Reading a never-written array yields the declared fill value."""
        array = zarr.create(
            shape=(5,),
            dtype="float64",
            chunks=(5,),
            fill_value=100.0,
            codecs=[ScaleOffset(offset=10, scale=2), BytesCodec()],
            store=MemoryStore(),
        )
        # Nothing is written first, so the read below returns the fill value.
        np.testing.assert_allclose(array[:], np.full(5, 100.0))  # type: ignore[arg-type]

    def test_validate_rejects_complex(self) -> None:
        """Creating a complex128 array with this codec raises ``ValueError``."""
        with pytest.raises(ValueError, match="only supports integer and floating-point"):
            zarr.create(
                shape=(5,),
                dtype="complex128",
                chunks=(5,),
                codecs=[ScaleOffset(offset=1, scale=2), BytesCodec()],
                store=MemoryStore(),
            )

    def test_to_dict_no_config(self) -> None:
        """An all-default codec serializes to a dict with only the ``name`` key."""
        assert ScaleOffset().to_dict() == {"name": "scale_offset"}  # type: ignore[comparison-overlap]

    def test_to_dict_with_config(self) -> None:
        """Both parameters appear under ``configuration`` when both are set."""
        dumped = ScaleOffset(offset=5, scale=0.1).to_dict()
        assert dumped == {"name": "scale_offset", "configuration": {"offset": 5, "scale": 0.1}}  # type: ignore[comparison-overlap]

    def test_to_dict_offset_only(self) -> None:
        """``scale`` is left out of ``configuration`` when only ``offset`` is given."""
        dumped = ScaleOffset(offset=5).to_dict()
        assert dumped == {"name": "scale_offset", "configuration": {"offset": 5}}  # type: ignore[comparison-overlap]

    def test_from_dict_no_config(self) -> None:
        """A dict without ``configuration`` parses to offset 0 and scale 1."""
        parsed = ScaleOffset.from_dict({"name": "scale_offset"})
        assert parsed.scale == 1
        assert parsed.offset == 0

    def test_from_dict_with_config(self) -> None:
        """Values under ``configuration`` are picked up by ``from_dict``."""
        parsed = ScaleOffset.from_dict(
            {"name": "scale_offset", "configuration": {"offset": 5, "scale": 0.1}}
        )
        assert parsed.scale == 0.1
        assert parsed.offset == 5

    def test_roundtrip_json(self) -> None:
        """Feeding ``to_dict`` output into ``from_dict`` keeps offset and scale."""
        before = ScaleOffset(offset=3.14, scale=2.71)
        after = ScaleOffset.from_dict(before.to_dict())
        assert after.scale == before.scale
        assert after.offset == before.offset