Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ repos:
- id: yamllint
exclude: pre-commit-config.yaml
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: "v0.14.13"
rev: "v0.14.14"
hooks:
- id: ruff-format
- id: ruff-check
Expand Down
6 changes: 4 additions & 2 deletions src/ome_arrow/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,8 @@ def __init__(

# --- 5) Plain dict matching the schema -----------------------------------
elif isinstance(data, dict):
self.data = pa.scalar(data, type=OME_ARROW_STRUCT)
record = {f.name: data.get(f.name) for f in OME_ARROW_STRUCT}
self.data = pa.scalar(record, type=OME_ARROW_STRUCT)
if image_type is not None:
self.data = self._wrap_with_image_type(self.data, image_type)

Expand Down Expand Up @@ -239,7 +240,8 @@ def export( # noqa: PLR0911
compression / compression_level / tile:
OME-TIFF options (passed through to tifffile via BioIO).
chunks / zarr_compressor / zarr_level :
OME-Zarr options (chunk shape, compressor hint, level).
OME-Zarr options (chunk shape, compressor hint, level). If chunks is
None, a TCZYX default is chosen (1,1,<=4,<=512,<=512).
use_channel_colors:
Try to embed per-channel display colors when safe; otherwise omitted.
parquet_*:
Expand Down
202 changes: 193 additions & 9 deletions src/ome_arrow/export.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@ def to_numpy(
Convert an OME-Arrow record into a NumPy array shaped (T,C,Z,Y,X).

The OME-Arrow "planes" are flattened YX slices indexed by (z, t, c).
This function reconstitutes them into a dense TCZYX ndarray.
When chunks are present, this function reconstitutes the dense TCZYX array
from chunked pixels instead of planes.

Args:
data:
Expand Down Expand Up @@ -58,7 +59,7 @@ def to_numpy(
if sx <= 0 or sy <= 0 or sz <= 0 or sc <= 0 or st <= 0:
raise ValueError("All size_* fields must be positive integers.")

expected_len = sx * sy
expected_plane_len = sx * sy

# Prepare target array (T,C,Z,Y,X), zero-filled by default.
out = np.zeros((st, sc, sz, sy, sx), dtype=dtype)
Expand All @@ -78,6 +79,52 @@ def _cast_plane(a: np.ndarray) -> np.ndarray:
a = np.clip(a, lo, hi)
return a.astype(dtype, copy=False)

chunks = data.get("chunks") or []
if chunks:
chunk_grid = data.get("chunk_grid") or {}
chunk_order = str(chunk_grid.get("chunk_order") or "ZYX").upper()
if chunk_order != "ZYX":
raise ValueError("Only chunk_order='ZYX' is supported for now.")

for i, ch in enumerate(chunks):
t = int(ch["t"])
c = int(ch["c"])
z = int(ch["z"])
y = int(ch["y"])
x = int(ch["x"])
shape_z = int(ch["shape_z"])
shape_y = int(ch["shape_y"])
shape_x = int(ch["shape_x"])

if not (0 <= t < st and 0 <= c < sc and 0 <= z < sz):
raise ValueError(
f"chunks[{i}] index out of range: (t,c,z)=({t},{c},{z})"
)
if y < 0 or x < 0 or shape_z <= 0 or shape_y <= 0 or shape_x <= 0:
raise ValueError(f"chunks[{i}] has invalid shape or origin.")

pix = ch["pixels"]
try:
n = len(pix)
except Exception as e:
raise ValueError(f"chunks[{i}].pixels is not a sequence") from e

expected_len = shape_z * shape_y * shape_x
if n != expected_len:
if strict:
raise ValueError(
f"chunks[{i}].pixels length {n} != expected {expected_len}"
)
if n > expected_len:
pix = pix[:expected_len]
else:
pix = list(pix) + [0] * (expected_len - n)

arr3d = np.asarray(pix).reshape(shape_z, shape_y, shape_x)
arr3d = _cast_plane(arr3d)
out[t, c, z : z + shape_z, y : y + shape_y, x : x + shape_x] = arr3d
return out
Comment on lines +82 to +126
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Validate chunk extents against image bounds.

Only lower bounds are checked; chunks that extend past Z/Y/X can yield confusing shape-mismatch errors during assignment. Add upper-bound validation before reshaping/assignment.

🔧 Proposed fix
             if y < 0 or x < 0 or shape_z <= 0 or shape_y <= 0 or shape_x <= 0:
                 raise ValueError(f"chunks[{i}] has invalid shape or origin.")
+            if z + shape_z > sz or y + shape_y > sy or x + shape_x > sx:
+                raise ValueError(
+                    f"chunks[{i}] exceeds image bounds for (z,y,x)=({z},{y},{x})"
+                )
🧰 Tools
🪛 Ruff (0.14.13)

87-87: Avoid specifying long messages outside the exception class

(TRY003)


100-102: Avoid specifying long messages outside the exception class

(TRY003)


104-104: Avoid specifying long messages outside the exception class

(TRY003)


110-110: Avoid specifying long messages outside the exception class

(TRY003)


115-117: Avoid specifying long messages outside the exception class

(TRY003)

🤖 Prompt for AI Agents
In `@src/ome_arrow/export.py` around lines 82 - 126, The loop over chunks
currently only checks lower bounds but must also validate that each chunk's
extents do not exceed the image bounds before reshaping/assignment: for each
chunk (variables t,c,z,y,x,shape_z,shape_y,shape_x) add checks that z + shape_z
<= sz, and that y + shape_y <= sy and x + shape_x <= sx (you can get sy,sx from
out.shape[3] and out.shape[4]) and raise a clear ValueError like f"chunks[{i}]
extent out of range: z+shape_z={z+shape_z} > sz={sz}" (and analogous messages
for y/x) before computing expected_len/reshape and assigning into
out[t,c,z:z+shape_z,y:y+shape_y,x:x+shape_x]; this replaces the confusing NumPy
broadcast (shape-mismatch) errors that would otherwise surface during arr3d assignment.


# Fill planes.
for i, p in enumerate(data.get("planes", [])):
z = int(p["z"])
Expand All @@ -94,16 +141,17 @@ def _cast_plane(a: np.ndarray) -> np.ndarray:
except Exception as e:
raise ValueError(f"planes[{i}].pixels is not a sequence") from e

if n != expected_len:
if n != expected_plane_len:
if strict:
raise ValueError(
f"planes[{i}].pixels length {n} != size_x*size_y {expected_len}"
f"planes[{i}].pixels length {n} != size_x*size_y "
f"{expected_plane_len}"
)
# Lenient mode: fix length by truncation or zero-pad.
if n > expected_len:
pix = pix[:expected_len]
if n > expected_plane_len:
pix = pix[:expected_plane_len]
else:
pix = list(pix) + [0] * (expected_len - n)
pix = list(pix) + [0] * (expected_plane_len - n)

# Reshape to (Y,X) and cast.
arr2d = np.asarray(pix).reshape(sy, sx)
Expand All @@ -113,6 +161,128 @@ def _cast_plane(a: np.ndarray) -> np.ndarray:
return out


def plane_from_chunks(
    data: Dict[str, Any] | pa.StructScalar,
    *,
    t: int,
    c: int,
    z: int,
    dtype: np.dtype = np.uint16,
    strict: bool = True,
    clamp: bool = False,
) -> np.ndarray:
    """Extract a single (t, c, z) plane using chunked pixels when available.

    Chunks whose (t, c) match and whose Z range covers ``z`` are pasted into a
    zero-filled (Y, X) plane. If no chunk covers the requested plane (or there
    are no chunks at all), the ``planes`` list is used instead.

    Args:
        data: OME-Arrow data as a Python dict or a `pa.StructScalar`.
        t: Time index for the plane.
        c: Channel index for the plane.
        z: Z index for the plane.
        dtype: Output dtype (default: np.uint16).
        strict: When True, raise if a pixel buffer has the wrong length.
            When False, over-long buffers are truncated and short ones are
            zero-padded.
        clamp: If True, clamp values to the valid range of the target dtype.

    Returns:
        np.ndarray: 2D array with shape (Y, X).

    Raises:
        KeyError: If required OME-Arrow fields are missing.
        ValueError: If indices are out of range, a contributing chunk has a
            negative origin, non-positive shape or extends past the image
            bounds, pixels are malformed, or the plane is not found in either
            the chunks or the planes list.
    """
    if isinstance(data, pa.StructScalar):
        data = data.as_py()

    pm = data["pixels_meta"]
    sx, sy = int(pm["size_x"]), int(pm["size_y"])
    sz, sc, st = int(pm["size_z"]), int(pm["size_c"]), int(pm["size_t"])
    if not (0 <= t < st and 0 <= c < sc and 0 <= z < sz):
        raise ValueError(f"Requested plane (t={t}, c={c}, z={z}) out of range.")

    # Clamp limits: integer dtypes use their representable range; everything
    # else is left unbounded.
    if np.issubdtype(dtype, np.integer):
        info = np.iinfo(dtype)
        lo, hi = info.min, info.max
    else:
        lo, hi = -np.inf, np.inf

    def _cast_plane(a: np.ndarray) -> np.ndarray:
        """Optionally clamp, then cast to the requested dtype."""
        if clamp:
            a = np.clip(a, lo, hi)
        return a.astype(dtype, copy=False)

    def _fit_pixels(pix: Any, expected_len: int, label: str, expectation: str) -> Any:
        """Return ``pix`` with exactly ``expected_len`` items, or raise.

        In strict mode a length mismatch is an error; otherwise the buffer is
        truncated or zero-padded to fit.
        """
        try:
            n = len(pix)
        except Exception as e:
            raise ValueError(f"{label} is not a sequence") from e
        if n == expected_len:
            return pix
        if strict:
            raise ValueError(f"{label} length {n} != {expectation} {expected_len}")
        if n > expected_len:
            return pix[:expected_len]
        return list(pix) + [0] * (expected_len - n)

    chunks = data.get("chunks") or []
    if chunks:
        chunk_grid = data.get("chunk_grid") or {}
        chunk_order = str(chunk_grid.get("chunk_order") or "ZYX").upper()
        if chunk_order != "ZYX":
            raise ValueError("Only chunk_order='ZYX' is supported for now.")

        plane = np.zeros((sy, sx), dtype=dtype)
        matched = False
        for i, ch in enumerate(chunks):
            if int(ch["t"]) != t or int(ch["c"]) != c:
                continue
            z0 = int(ch["z"])
            szc = int(ch["shape_z"])
            if not (z0 <= z < z0 + szc):
                continue
            y0, x0 = int(ch["y"]), int(ch["x"])
            syc, sxc = int(ch["shape_y"]), int(ch["shape_x"])

            # NumPy clips out-of-range slices, so an unchecked bad chunk would
            # either raise an opaque broadcast error or (for size-1 tiles) be
            # dropped silently. Reject it explicitly instead.
            if z0 < 0 or y0 < 0 or x0 < 0 or syc <= 0 or sxc <= 0:
                raise ValueError(f"chunks[{i}] has invalid shape or origin.")
            if z0 + szc > sz or y0 + syc > sy or x0 + sxc > sx:
                raise ValueError(f"chunks[{i}] exceeds image bounds.")

            pix = _fit_pixels(
                ch["pixels"], szc * syc * sxc, f"chunks[{i}].pixels", "expected"
            )
            slab = np.asarray(pix).reshape(szc, syc, sxc)
            # Only the requested Z slice of the slab is cast and pasted.
            plane[y0 : y0 + syc, x0 : x0 + sxc] = _cast_plane(slab[z - z0])
            matched = True

        # Return the chunk-assembled plane only if a chunk actually covered
        # it; otherwise fall through instead of returning all zeros.
        if matched:
            return plane

    # Fallback to planes list if chunks are absent or none cover this plane.
    # `or []` also tolerates a "planes" key that is present but None.
    target = next(
        (
            p
            for p in (data.get("planes") or [])
            if int(p["t"]) == t and int(p["c"]) == c and int(p["z"]) == z
        ),
        None,
    )
    if target is None:
        raise ValueError(f"plane (t={t}, c={c}, z={z}) not found")

    pix = _fit_pixels(target["pixels"], sx * sy, "plane pixels", "size_x*size_y")
    arr2d = np.asarray(pix).reshape(sy, sx)
    return _cast_plane(arr2d)


def to_ome_tiff(
data: Dict[str, Any] | pa.StructScalar,
out_path: str,
Expand Down Expand Up @@ -255,6 +425,7 @@ def to_ome_zarr(
- Creates level shapes for a multiscale pyramid (if multiscale_levels>1).
- Chooses Blosc codec compatible with zarr_format (v2 vs v3).
- Populates axes names/types/units and physical pixel sizes from pixels_meta.
- Uses default TCZYX chunks if none are provided.
"""
# --- local import to avoid hard deps at module import time
# Use the class you showed
Expand Down Expand Up @@ -317,6 +488,15 @@ def to_ome_zarr(
def _down(a: int, f: int) -> int:
return max(1, a // f)

def _default_chunks_tcxyz(
shape: Tuple[int, int, int, int, int],
) -> Tuple[int, int, int, int, int]:
_t, _c, z, y, x = shape
cz = min(z, 4) if z > 1 else 1
cy = min(y, 512)
cx = min(x, 512)
return (1, 1, cz, cy, cx)

def _level_shapes_tcxyz(levels: int) -> List[Tuple[int, int, int, int, int]]:
shapes = [(st, sc, sz, sy, sx)]
for _ in range(levels - 1):
Expand All @@ -340,6 +520,8 @@ def _level_shapes_tcxyz(levels: int) -> List[Tuple[int, int, int, int, int]]:
# 5) Chunking / shards (can be single-shape or per-level;
# we pass single-shape if provided)
chunk_shape: Optional[List[Tuple[int, ...]]] = None
if chunks is None:
chunks = _default_chunks_tcxyz((st, sc, sz, sy, sx))
if chunks is not None:
chunk_shape = [tuple(int(v) for v in chunks)] * multiscale_levels

Expand Down Expand Up @@ -393,7 +575,8 @@ def to_ome_parquet(
record_dict = data.as_py()
else:
# Validate by round-tripping through a typed scalar, then back to dict.
record_dict = pa.scalar(data, type=OME_ARROW_STRUCT).as_py()
record_dict = {f.name: data.get(f.name) for f in OME_ARROW_STRUCT}
record_dict = pa.scalar(record_dict, type=OME_ARROW_STRUCT).as_py()

# 2) Build a single-row struct array from the dict, explicitly passing the schema
struct_array = pa.array([record_dict], type=OME_ARROW_STRUCT) # len=1
Expand Down Expand Up @@ -456,7 +639,8 @@ def to_ome_vortex(
record_dict = data.as_py()
else:
# Validate by round-tripping through a typed scalar, then back to dict.
record_dict = pa.scalar(data, type=OME_ARROW_STRUCT).as_py()
record_dict = {f.name: data.get(f.name) for f in OME_ARROW_STRUCT}
record_dict = pa.scalar(record_dict, type=OME_ARROW_STRUCT).as_py()

# 2) Build a single-row struct array from the dict, explicitly passing the schema
struct_array = pa.array([record_dict], type=OME_ARROW_STRUCT) # len=1
Expand Down
Loading