From 8027c0bc88c903895bde5b111044e37d3641e226 Mon Sep 17 00:00:00 2001 From: Lucas Diedrich Date: Sun, 16 Feb 2025 13:36:41 +0100 Subject: [PATCH 01/39] feature: Initial lazy tiff reader --- src/spatialdata_io/readers/generic.py | 186 ++++++++++++++++++++++++-- 1 file changed, 176 insertions(+), 10 deletions(-) diff --git a/src/spatialdata_io/readers/generic.py b/src/spatialdata_io/readers/generic.py index 24ee7071..30a16dd7 100644 --- a/src/spatialdata_io/readers/generic.py +++ b/src/spatialdata_io/readers/generic.py @@ -1,23 +1,34 @@ from __future__ import annotations import warnings -from collections.abc import Sequence +from collections.abc import Callable, Sequence from pathlib import Path +from typing import Any, Protocol, TypeVar +import dask.array as da import numpy as np -from dask_image.imread import imread +from dask import delayed from geopandas import GeoDataFrame +from numpy.typing import NDArray from spatialdata._docs import docstring_parameter from spatialdata.models import Image2DModel, ShapesModel from spatialdata.models._utils import DEFAULT_COORDINATE_SYSTEM from spatialdata.transformations import Identity +from tifffile import memmap as tiffmmemap from xarray import DataArray -VALID_IMAGE_TYPES = [".tif", ".tiff", ".png", ".jpg", ".jpeg"] +VALID_IMAGE_TYPES = [".tif", ".tiff"] VALID_SHAPE_TYPES = [".geojson"] +DEFAULT_CHUNKSIZE = (1000, 1000) __all__ = ["generic", "geojson", "image", "VALID_IMAGE_TYPES", "VALID_SHAPE_TYPES"] +T = TypeVar("T", bound=np.generic) # Restrict to NumPy scalar types + + +class DaskArray(Protocol[T]): + dtype: np.dtype[T] + @docstring_parameter( valid_image_types=", ".join(VALID_IMAGE_TYPES), @@ -68,11 +79,166 @@ def geojson(input: Path, coordinate_system: str) -> GeoDataFrame: return ShapesModel.parse(input, transformations={coordinate_system: Identity()}) +def _compute_sizes_positions(size: int, chunk: int, min_coord: int) -> tuple[NDArray[np.int_], NDArray[np.int_]]: + """Calculate chunk sizes and positions for a given dimension and chunk size""" + # All chunks have the same size except for the last one + positions = np.arange(min_coord, size, chunk) + lengths = np.full_like(positions, chunk, dtype=int) + + if positions[-1] + chunk > size: + lengths[-1] = size - positions[-1] + + return positions, lengths + + +def _compute_chunks( + dimensions: tuple[int, int], + chunk_size: tuple[int, int], + min_coordinates: tuple[int, int] = (0, 0), +) -> NDArray[np.int_]: + """Create all chunk specs for a given image and chunk size. + + Creates specifications (x, y, width, height) with (x, y) being the upper left corner + of chunks of size chunk_size. Chunks at the edges correspond to the remainder of + chunk size and dimensions + + Parameters + ---------- + dimensions : tuple[int, int] + Size of the image in (width, height). + chunk_size : tuple[int, int] + Size of individual tiles in (width, height). + min_coordinates : tuple[int, int], optional + Minimum coordinates (x, y) in the image, defaults to (0, 0). + + Returns + ------- + np.ndarray + Array of shape (n_tiles_x, n_tiles_y, 4). Each entry defines a tile + as (x, y, width, height). 
+ """ + x_positions, widths = _compute_sizes_positions(dimensions[1], chunk_size[1], min_coord=min_coordinates[1]) + y_positions, heights = _compute_sizes_positions(dimensions[0], chunk_size[0], min_coord=min_coordinates[0]) + + # Generate the tiles + tiles = np.array( + [ + [[x, y, w, h] for y, h in zip(y_positions, heights, strict=True)] + for x, w in zip(x_positions, widths, strict=True) + ], + dtype=int, + ) + return tiles + + +def _read_chunks( + func: Callable[..., NDArray[np.int_]], + slide: Any, + coords: NDArray[np.int_], + n_channel: int, + dtype: type, + **func_kwargs: Any, +) -> list[list[da.array]]: + """Abstract factory method to tile a large microscopy image. + + Parameters + ---------- + func + Function to retrieve a rectangular tile from the slide image. Must take the + arguments: + + - slide Full slide image + - x0: x (col) coordinate of upper left corner of chunk + - y0: y (row) coordinate of upper left corner of chunk + - width: Width of chunk + - height: Height of chunk + + and should return the chunk as numpy array of shape (c, y, x) + slide + Slide image in lazyly loaded format compatible with func + coords + Coordinates of the upper left corner of the image in format (n_row_x, n_row_y, 4) + where the last dimension defines the rectangular tile in format (x, y, width, height). + n_row_x represents the number of chunks in x dimension and n_row_y the number of chunks + in y dimension. + n_channel + Number of channels in array + dtype + Data type of image + func_kwargs + Additional keyword arguments passed to func + + Returns + ------- + list[list[da.array]] + List (length: n_row_x) of lists (length: n_row_y) of chunks. + Represents all chunks of the full image. + """ + func_kwargs = func_kwargs if func_kwargs else {} + + # Collect each delayed chunk as item in list of list + # Inner list becomes dim=-1 (cols/x) + # Outer list becomes dim=-2 (rows/y) + # see dask.array.block + chunks = [ + [ + da.from_delayed( + delayed(func)( + slide, + x0=coords[x, y, 0], + y0=coords[x, y, 1], + width=coords[x, y, 2], + height=coords[x, y, 3], + **func_kwargs, + ), + dtype=dtype, + shape=(n_channel, *coords[x, y, [2, 3]]), + ) + for y in range(coords.shape[0]) + ] + for x in range(coords.shape[1]) + ] + return chunks + + +def _tiff_to_chunks(input: Path) -> list[list[DaskArray[np.int_]]]: + """Chunkwise reader for tiff files. 
+ + Parameters + ---------- + input + Path to image + + Returns + ------- + list[list[DaskArray]] + """ + # Lazy file reader + slide = tiffmmemap(input) + + # Get dimensions in (y, x) + slide_dimensions = slide.shape[:-1] + + # Get number of channels (c) + n_channel = slide.shape[-1] + + # Compute chunk coords + chunk_coords = _compute_chunks(slide_dimensions, chunk_size=DEFAULT_CHUNKSIZE, min_coordinates=(0, 0)) + + # Define reader func + def _reader_func(slide: NDArray[np.int_], x0: int, y0: int, width: int, height: int) -> NDArray[np.int_]: + return np.array(slide[y0 : y0 + height, x0 : x0 + width, :]).T + + return _read_chunks(_reader_func, slide, coords=chunk_coords, n_channel=n_channel, dtype=slide.dtype) + + def image(input: Path, data_axes: Sequence[str], coordinate_system: str) -> DataArray: - """Reads an image file and returns a parsed Image2D spatial element""" - # this function is just a draft, the more general one will be available when - # https://github.com/scverse/spatialdata-io/pull/234 is merged - image = imread(input) - if len(image.shape) == len(data_axes) + 1 and image.shape[0] == 1: - image = np.squeeze(image, axis=0) - return Image2DModel.parse(image, dims=data_axes, transformations={coordinate_system: Identity()}) + """Reads an image file and returns a parsed Image2DModel""" + if input.suffix in [".tiff", ".tif"]: + chunks = _tiff_to_chunks(input) + else: + raise NotImplementedError(f"File format {input.suffix} not implemented") + + img = da.block(chunks, allow_unknown_chunksizes=True) + + return Image2DModel.parse(img, dims=data_axes, transformations={coordinate_system: Identity()}) From 7ff1d2340ef2b284067299b28de4848026517387 Mon Sep 17 00:00:00 2001 From: Lucas Diedrich Date: Sun, 16 Feb 2025 14:48:20 +0100 Subject: [PATCH 02/39] Updated comments --- src/spatialdata_io/readers/generic.py | 38 ++++++++++++++++----------- 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/src/spatialdata_io/readers/generic.py b/src/spatialdata_io/readers/generic.py index 30a16dd7..6f902b6a 100644 --- a/src/spatialdata_io/readers/generic.py +++ b/src/spatialdata_io/readers/generic.py @@ -123,8 +123,8 @@ def _compute_chunks( # Generate the tiles tiles = np.array( [ - [[x, y, w, h] for y, h in zip(y_positions, heights, strict=True)] - for x, w in zip(x_positions, widths, strict=True) + [[x, y, w, h] for x, w in zip(x_positions, widths, strict=True)] + for y, h in zip(y_positions, heights, strict=True) ], dtype=int, ) @@ -136,7 +136,7 @@ def _read_chunks( slide: Any, coords: NDArray[np.int_], n_channel: int, - dtype: type, + dtype: np.number, **func_kwargs: Any, ) -> list[list[da.array]]: """Abstract factory method to tile a large microscopy image. @@ -185,23 +185,23 @@ def _read_chunks( da.from_delayed( delayed(func)( slide, - x0=coords[x, y, 0], - y0=coords[x, y, 1], - width=coords[x, y, 2], - height=coords[x, y, 3], + x0=coords[y, x, 0], + y0=coords[y, x, 1], + width=coords[y, x, 2], + height=coords[y, x, 3], **func_kwargs, ), dtype=dtype, - shape=(n_channel, *coords[x, y, [2, 3]]), + shape=(n_channel, *coords[y, x, [3, 2]]), ) - for y in range(coords.shape[0]) + for x in range(coords.shape[1]) ] - for x in range(coords.shape[1]) + for y in range(coords.shape[0]) ] return chunks -def _tiff_to_chunks(input: Path) -> list[list[DaskArray[np.int_]]]: +def _tiff_to_chunks(input: Path, axes_dim_mapping: dict[str, int]) -> list[list[DaskArray[np.int_]]]: """Chunkwise reader for tiff files. 
Parameters @@ -216,26 +216,32 @@ def _tiff_to_chunks(input: Path) -> list[list[DaskArray[np.int_]]]: # Lazy file reader slide = tiffmmemap(input) - # Get dimensions in (y, x) - slide_dimensions = slide.shape[:-1] + # Transpose to cyx order + slide = np.transpose(slide, (axes_dim_mapping["c"], axes_dim_mapping["y"], axes_dim_mapping["x"])) + + # Get dimensions in (x, y) + slide_dimensions = slide.shape[2], slide.shape[1] # Get number of channels (c) - n_channel = slide.shape[-1] + n_channel = slide.shape[0] # Compute chunk coords chunk_coords = _compute_chunks(slide_dimensions, chunk_size=DEFAULT_CHUNKSIZE, min_coordinates=(0, 0)) # Define reader func def _reader_func(slide: NDArray[np.int_], x0: int, y0: int, width: int, height: int) -> NDArray[np.int_]: - return np.array(slide[y0 : y0 + height, x0 : x0 + width, :]).T + return np.array(slide[:, y0 : y0 + height, x0 : x0 + width]) return _read_chunks(_reader_func, slide, coords=chunk_coords, n_channel=n_channel, dtype=slide.dtype) def image(input: Path, data_axes: Sequence[str], coordinate_system: str) -> DataArray: """Reads an image file and returns a parsed Image2DModel""" + # Map passed data axes to position of dimension + axes_dim_mapping = {axes: ndim for ndim, axes in enumerate(data_axes)} + if input.suffix in [".tiff", ".tif"]: - chunks = _tiff_to_chunks(input) + chunks = _tiff_to_chunks(input, axes_dim_mapping=axes_dim_mapping) else: raise NotImplementedError(f"File format {input.suffix} not implemented") From 826133ab4cac7619cc8f46aab8eeb6dfe632595a Mon Sep 17 00:00:00 2001 From: Lucas Diedrich Date: Sun, 16 Feb 2025 14:51:46 +0100 Subject: [PATCH 03/39] Updated comments --- src/spatialdata_io/readers/generic.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/spatialdata_io/readers/generic.py b/src/spatialdata_io/readers/generic.py index 6f902b6a..3cac8876 100644 --- a/src/spatialdata_io/readers/generic.py +++ b/src/spatialdata_io/readers/generic.py @@ -79,7 +79,7 @@ def geojson(input: Path, coordinate_system: str) -> GeoDataFrame: return ShapesModel.parse(input, transformations={coordinate_system: Identity()}) -def _compute_sizes_positions(size: int, chunk: int, min_coord: int) -> tuple[NDArray[np.int_], NDArray[np.int_]]: +def _compute_chunk_sizes_positions(size: int, chunk: int, min_coord: int) -> tuple[NDArray[np.int_], NDArray[np.int_]]: """Calculate chunk sizes and positions for a given dimension and chunk size""" # All chunks have the same size except for the last one positions = np.arange(min_coord, size, chunk) @@ -117,8 +117,8 @@ def _compute_chunks( Array of shape (n_tiles_x, n_tiles_y, 4). Each entry defines a tile as (x, y, width, height). """ - x_positions, widths = _compute_sizes_positions(dimensions[1], chunk_size[1], min_coord=min_coordinates[1]) - y_positions, heights = _compute_sizes_positions(dimensions[0], chunk_size[0], min_coord=min_coordinates[0]) + x_positions, widths = _compute_chunk_sizes_positions(dimensions[1], chunk_size[1], min_coord=min_coordinates[1]) + y_positions, heights = _compute_chunk_sizes_positions(dimensions[0], chunk_size[0], min_coord=min_coordinates[0]) # Generate the tiles tiles = np.array( @@ -139,7 +139,7 @@ def _read_chunks( dtype: np.number, **func_kwargs: Any, ) -> list[list[da.array]]: - """Abstract factory method to tile a large microscopy image. + """Abstract method to tile a large microscopy image. 
Parameters ---------- From df258bc7f79811a0fca6e873fdadceca198faa38 Mon Sep 17 00:00:00 2001 From: Lucas Diedrich Date: Sun, 16 Feb 2025 15:03:04 +0100 Subject: [PATCH 04/39] Move utility functions to designated submodule readers._utils._image --- src/spatialdata_io/readers/_utils/_image.py | 129 +++++++++++++++++++ src/spatialdata_io/readers/generic.py | 131 +------------------- 2 files changed, 135 insertions(+), 125 deletions(-) create mode 100644 src/spatialdata_io/readers/_utils/_image.py diff --git a/src/spatialdata_io/readers/_utils/_image.py b/src/spatialdata_io/readers/_utils/_image.py new file mode 100644 index 00000000..ab175512 --- /dev/null +++ b/src/spatialdata_io/readers/_utils/_image.py @@ -0,0 +1,129 @@ +from collections.abc import Callable +from typing import Any + +import dask.array as da +import numpy as np +from dask import delayed +from numpy.typing import NDArray + + +def _compute_chunk_sizes_positions(size: int, chunk: int, min_coord: int) -> tuple[NDArray[np.int_], NDArray[np.int_]]: + """Calculate chunk sizes and positions for a given dimension and chunk size""" + # All chunks have the same size except for the last one + positions = np.arange(min_coord, size, chunk) + lengths = np.full_like(positions, chunk, dtype=int) + + if positions[-1] + chunk > size: + lengths[-1] = size - positions[-1] + + return positions, lengths + + +def _compute_chunks( + dimensions: tuple[int, int], + chunk_size: tuple[int, int], + min_coordinates: tuple[int, int] = (0, 0), +) -> NDArray[np.int_]: + """Create all chunk specs for a given image and chunk size. + + Creates specifications (x, y, width, height) with (x, y) being the upper left corner + of chunks of size chunk_size. Chunks at the edges correspond to the remainder of + chunk size and dimensions + + Parameters + ---------- + dimensions : tuple[int, int] + Size of the image in (width, height). + chunk_size : tuple[int, int] + Size of individual tiles in (width, height). + min_coordinates : tuple[int, int], optional + Minimum coordinates (x, y) in the image, defaults to (0, 0). + + Returns + ------- + np.ndarray + Array of shape (n_tiles_x, n_tiles_y, 4). Each entry defines a tile + as (x, y, width, height). + """ + x_positions, widths = _compute_chunk_sizes_positions(dimensions[1], chunk_size[1], min_coord=min_coordinates[1]) + y_positions, heights = _compute_chunk_sizes_positions(dimensions[0], chunk_size[0], min_coord=min_coordinates[0]) + + # Generate the tiles + tiles = np.array( + [ + [[x, y, w, h] for x, w in zip(x_positions, widths, strict=True)] + for y, h in zip(y_positions, heights, strict=True) + ], + dtype=int, + ) + return tiles + + +def _read_chunks( + func: Callable[..., NDArray[np.int_]], + slide: Any, + coords: NDArray[np.int_], + n_channel: int, + dtype: np.number, + **func_kwargs: Any, +) -> list[list[da.array]]: + """Abstract method to tile a large microscopy image. + + Parameters + ---------- + func + Function to retrieve a rectangular tile from the slide image. Must take the + arguments: + + - slide Full slide image + - x0: x (col) coordinate of upper left corner of chunk + - y0: y (row) coordinate of upper left corner of chunk + - width: Width of chunk + - height: Height of chunk + + and should return the chunk as numpy array of shape (c, y, x) + slide + Slide image in lazyly loaded format compatible with func + coords + Coordinates of the upper left corner of the image in format (n_row_x, n_row_y, 4) + where the last dimension defines the rectangular tile in format (x, y, width, height). 
+ n_row_x represents the number of chunks in x dimension and n_row_y the number of chunks + in y dimension. + n_channel + Number of channels in array + dtype + Data type of image + func_kwargs + Additional keyword arguments passed to func + + Returns + ------- + list[list[da.array]] + List (length: n_row_x) of lists (length: n_row_y) of chunks. + Represents all chunks of the full image. + """ + func_kwargs = func_kwargs if func_kwargs else {} + + # Collect each delayed chunk as item in list of list + # Inner list becomes dim=-1 (cols/x) + # Outer list becomes dim=-2 (rows/y) + # see dask.array.block + chunks = [ + [ + da.from_delayed( + delayed(func)( + slide, + x0=coords[y, x, 0], + y0=coords[y, x, 1], + width=coords[y, x, 2], + height=coords[y, x, 3], + **func_kwargs, + ), + dtype=dtype, + shape=(n_channel, *coords[y, x, [3, 2]]), + ) + for x in range(coords.shape[1]) + ] + for y in range(coords.shape[0]) + ] + return chunks diff --git a/src/spatialdata_io/readers/generic.py b/src/spatialdata_io/readers/generic.py index 3cac8876..65158413 100644 --- a/src/spatialdata_io/readers/generic.py +++ b/src/spatialdata_io/readers/generic.py @@ -1,13 +1,12 @@ from __future__ import annotations import warnings -from collections.abc import Callable, Sequence +from collections.abc import Sequence from pathlib import Path -from typing import Any, Protocol, TypeVar +from typing import Protocol, TypeVar import dask.array as da import numpy as np -from dask import delayed from geopandas import GeoDataFrame from numpy.typing import NDArray from spatialdata._docs import docstring_parameter @@ -17,6 +16,8 @@ from tifffile import memmap as tiffmmemap from xarray import DataArray +from ._utils._image import _compute_chunks, _read_chunks + VALID_IMAGE_TYPES = [".tif", ".tiff"] VALID_SHAPE_TYPES = [".geojson"] DEFAULT_CHUNKSIZE = (1000, 1000) @@ -79,128 +80,6 @@ def geojson(input: Path, coordinate_system: str) -> GeoDataFrame: return ShapesModel.parse(input, transformations={coordinate_system: Identity()}) -def _compute_chunk_sizes_positions(size: int, chunk: int, min_coord: int) -> tuple[NDArray[np.int_], NDArray[np.int_]]: - """Calculate chunk sizes and positions for a given dimension and chunk size""" - # All chunks have the same size except for the last one - positions = np.arange(min_coord, size, chunk) - lengths = np.full_like(positions, chunk, dtype=int) - - if positions[-1] + chunk > size: - lengths[-1] = size - positions[-1] - - return positions, lengths - - -def _compute_chunks( - dimensions: tuple[int, int], - chunk_size: tuple[int, int], - min_coordinates: tuple[int, int] = (0, 0), -) -> NDArray[np.int_]: - """Create all chunk specs for a given image and chunk size. - - Creates specifications (x, y, width, height) with (x, y) being the upper left corner - of chunks of size chunk_size. Chunks at the edges correspond to the remainder of - chunk size and dimensions - - Parameters - ---------- - dimensions : tuple[int, int] - Size of the image in (width, height). - chunk_size : tuple[int, int] - Size of individual tiles in (width, height). - min_coordinates : tuple[int, int], optional - Minimum coordinates (x, y) in the image, defaults to (0, 0). - - Returns - ------- - np.ndarray - Array of shape (n_tiles_x, n_tiles_y, 4). Each entry defines a tile - as (x, y, width, height). 
- """ - x_positions, widths = _compute_chunk_sizes_positions(dimensions[1], chunk_size[1], min_coord=min_coordinates[1]) - y_positions, heights = _compute_chunk_sizes_positions(dimensions[0], chunk_size[0], min_coord=min_coordinates[0]) - - # Generate the tiles - tiles = np.array( - [ - [[x, y, w, h] for x, w in zip(x_positions, widths, strict=True)] - for y, h in zip(y_positions, heights, strict=True) - ], - dtype=int, - ) - return tiles - - -def _read_chunks( - func: Callable[..., NDArray[np.int_]], - slide: Any, - coords: NDArray[np.int_], - n_channel: int, - dtype: np.number, - **func_kwargs: Any, -) -> list[list[da.array]]: - """Abstract method to tile a large microscopy image. - - Parameters - ---------- - func - Function to retrieve a rectangular tile from the slide image. Must take the - arguments: - - - slide Full slide image - - x0: x (col) coordinate of upper left corner of chunk - - y0: y (row) coordinate of upper left corner of chunk - - width: Width of chunk - - height: Height of chunk - - and should return the chunk as numpy array of shape (c, y, x) - slide - Slide image in lazyly loaded format compatible with func - coords - Coordinates of the upper left corner of the image in format (n_row_x, n_row_y, 4) - where the last dimension defines the rectangular tile in format (x, y, width, height). - n_row_x represents the number of chunks in x dimension and n_row_y the number of chunks - in y dimension. - n_channel - Number of channels in array - dtype - Data type of image - func_kwargs - Additional keyword arguments passed to func - - Returns - ------- - list[list[da.array]] - List (length: n_row_x) of lists (length: n_row_y) of chunks. - Represents all chunks of the full image. - """ - func_kwargs = func_kwargs if func_kwargs else {} - - # Collect each delayed chunk as item in list of list - # Inner list becomes dim=-1 (cols/x) - # Outer list becomes dim=-2 (rows/y) - # see dask.array.block - chunks = [ - [ - da.from_delayed( - delayed(func)( - slide, - x0=coords[y, x, 0], - y0=coords[y, x, 1], - width=coords[y, x, 2], - height=coords[y, x, 3], - **func_kwargs, - ), - dtype=dtype, - shape=(n_channel, *coords[y, x, [3, 2]]), - ) - for x in range(coords.shape[1]) - ] - for y in range(coords.shape[0]) - ] - return chunks - - def _tiff_to_chunks(input: Path, axes_dim_mapping: dict[str, int]) -> list[list[DaskArray[np.int_]]]: """Chunkwise reader for tiff files. 
@@ -208,6 +87,8 @@ def _tiff_to_chunks(input: Path, axes_dim_mapping: dict[str, int]) -> list[list[ ---------- input Path to image + axes_dim_mapping + Mapping between dimension name (x, y, c) and index Returns ------- From db0d78278e13d0f35e18cc645900e82ee3a97985 Mon Sep 17 00:00:00 2001 From: Lucas Diedrich Date: Sun, 16 Feb 2025 15:27:52 +0100 Subject: [PATCH 05/39] Initial tests utils --- tests/readers/test_utils_image.py | 59 +++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 tests/readers/test_utils_image.py diff --git a/tests/readers/test_utils_image.py b/tests/readers/test_utils_image.py new file mode 100644 index 00000000..d9f0c295 --- /dev/null +++ b/tests/readers/test_utils_image.py @@ -0,0 +1,59 @@ +import numpy as np +import pytest +from numpy.typing import NDArray + +from spatialdata_io.readers._utils._image import ( + _compute_chunk_sizes_positions, + _compute_chunks, +) + + +@pytest.mark.parametrize( + ("size", "chunk", "min_coordinate", "positions", "lengths"), + [ + (300, 100, 0, np.array([0, 100, 200]), np.array([100, 100, 100])), + (300, 200, 0, np.array([0, 200]), np.array([200, 100])), + (300, 100, -100, np.array([-100, 0, 100]), np.array([100, 100, 100])), + (300, 200, -100, np.array([-100, 100]), np.array([200, 100])), + ], +) +def test_compute_chunk_sizes_positions( + size: int, + chunk: int, + min_coordinate: int, + positions: NDArray[np.number], + lengths: NDArray[np.number], +) -> None: + computed_positions, computed_lengths = _compute_chunk_sizes_positions(size, chunk, min_coordinate) + assert (positions == computed_positions).all() + assert (lengths == computed_lengths).all() + + +@pytest.mark.parametrize( + ("dimensions", "chunk_size", "min_coordinates", "result"), + [ + # Regular grid 2x2 + ( + (2, 2), + (1, 1), + (0, 0), + np.array([[[0, 0, 1, 1], [1, 0, 1, 1]], [[0, 1, 1, 1], [1, 1, 1, 1]]]), + ), + # Different tile sizes + ( + (3, 3), + (2, 2), + (0, 0), + np.array([[[0, 0, 2, 2], [2, 0, 1, 2]], [[0, 2, 2, 1], [2, 2, 1, 1]]]), + ), + ], +) +def test_compute_chunks( + dimensions: tuple[int, int], + chunk_size: tuple[int, int], + min_coordinates: tuple[int, int], + result: NDArray[np.number], +) -> None: + tiles = _compute_chunks(dimensions, chunk_size, min_coordinates) + + assert (tiles == result).all(), tiles From c03932f0d1f932a2b6b55a91ace1aeaede555cdd Mon Sep 17 00:00:00 2001 From: Lucas Diedrich Date: Sun, 16 Feb 2025 15:28:41 +0100 Subject: [PATCH 06/39] Fixes edge cases for min coordinate --- src/spatialdata_io/readers/_utils/_image.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/spatialdata_io/readers/_utils/_image.py b/src/spatialdata_io/readers/_utils/_image.py index ab175512..28e6e992 100644 --- a/src/spatialdata_io/readers/_utils/_image.py +++ b/src/spatialdata_io/readers/_utils/_image.py @@ -10,11 +10,11 @@ def _compute_chunk_sizes_positions(size: int, chunk: int, min_coord: int) -> tuple[NDArray[np.int_], NDArray[np.int_]]: """Calculate chunk sizes and positions for a given dimension and chunk size""" # All chunks have the same size except for the last one - positions = np.arange(min_coord, size, chunk) + positions = np.arange(min_coord, min_coord + size, chunk) lengths = np.full_like(positions, chunk, dtype=int) - if positions[-1] + chunk > size: - lengths[-1] = size - positions[-1] + if positions[-1] + chunk > size + min_coord: + lengths[-1] = size + min_coord - positions[-1] return positions, lengths From 339cbd88aae511d723ddc1eebfe82496f1340cf1 Mon Sep 17 00:00:00 2001 From: 
Lucas Diedrich Date: Sun, 16 Feb 2025 19:33:07 +0100 Subject: [PATCH 07/39] Added test for negative coordinates --- tests/readers/test_utils_image.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tests/readers/test_utils_image.py b/tests/readers/test_utils_image.py index d9f0c295..f1c58fa1 100644 --- a/tests/readers/test_utils_image.py +++ b/tests/readers/test_utils_image.py @@ -46,6 +46,12 @@ def test_compute_chunk_sizes_positions( (0, 0), np.array([[[0, 0, 2, 2], [2, 0, 1, 2]], [[0, 2, 2, 1], [2, 2, 1, 1]]]), ), + ( + (2, 2), + (1, 1), + (-1, 0), + np.array([[[0, -1, 1, 1], [1, -1, 1, 1]], [[0, 0, 1, 1], [1, 0, 1, 1]]]), + ), ], ) def test_compute_chunks( @@ -56,4 +62,4 @@ def test_compute_chunks( ) -> None: tiles = _compute_chunks(dimensions, chunk_size, min_coordinates) - assert (tiles == result).all(), tiles + assert (tiles == result).all() From 24c6eece2dfdd07f2b63a5d3807be0d689ad7818 Mon Sep 17 00:00:00 2001 From: Lucas Diedrich Date: Mon, 17 Feb 2025 16:46:03 +0100 Subject: [PATCH 08/39] Add support for png/jpg again --- src/spatialdata_io/readers/generic.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/spatialdata_io/readers/generic.py b/src/spatialdata_io/readers/generic.py index 65158413..8aec6765 100644 --- a/src/spatialdata_io/readers/generic.py +++ b/src/spatialdata_io/readers/generic.py @@ -7,6 +7,7 @@ import dask.array as da import numpy as np +from dask_image.imread import imread from geopandas import GeoDataFrame from numpy.typing import NDArray from spatialdata._docs import docstring_parameter @@ -18,7 +19,7 @@ from ._utils._image import _compute_chunks, _read_chunks -VALID_IMAGE_TYPES = [".tif", ".tiff"] +VALID_IMAGE_TYPES = [".tif", ".tiff", ".png", ".jpg", ".jpeg"] VALID_SHAPE_TYPES = [".geojson"] DEFAULT_CHUNKSIZE = (1000, 1000) @@ -123,6 +124,10 @@ def image(input: Path, data_axes: Sequence[str], coordinate_system: str) -> Data if input.suffix in [".tiff", ".tif"]: chunks = _tiff_to_chunks(input, axes_dim_mapping=axes_dim_mapping) + elif input.suffix in [".png", "jpg", "jpeg"]: + image = imread(input) + if len(image.shape) == len(data_axes) + 1 and image.shape[0] == 1: + image = np.squeeze(image, axis=0) else: raise NotImplementedError(f"File format {input.suffix} not implemented") From dbdc7c7516823eb58d2547630dffe731a646ab13 Mon Sep 17 00:00:00 2001 From: Lucas Diedrich Date: Mon, 17 Feb 2025 17:11:06 +0100 Subject: [PATCH 09/39] Add initial test --- tests/test_generic.py | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/tests/test_generic.py b/tests/test_generic.py index d4dda1a7..e8414fa7 100644 --- a/tests/test_generic.py +++ b/tests/test_generic.py @@ -9,9 +9,12 @@ from PIL import Image from spatialdata import SpatialData from spatialdata.datasets import blobs +from tifffile import imread as tiffread +from tifffile import imwrite as tiffwrite from spatialdata_io.__main__ import read_generic_wrapper from spatialdata_io.converters.generic_to_zarr import generic_to_zarr +from spatialdata_io.readers.generic import image @contextmanager @@ -33,6 +36,32 @@ def save_temp_files() -> Generator[tuple[Path, Path, Path], None, None]: yield jpg_path, geojson_path, Path(tmpdir) +@pytest.fixture(scope="module", params=[("c", "y", "x"), ("x", "y", "c")]) +def save_tiff_files( + request: pytest.FixtureRequest, +) -> Generator[tuple[Path, tuple[str], Path], None, None]: + with tempfile.TemporaryDirectory() as tmpdir: + axes = request.param + sdata = blobs() + # save the image as tiff + x = 
sdata["blobs_image"].transpose(*axes).data.compute() + img = np.clip(x * 255, 0, 255).astype(np.uint8) + + tiff_path = Path(tmpdir) / "blobs_image.tiff" + tiffwrite(tiff_path, img) + + yield tiff_path, axes, Path(tmpdir) + + +def test_read_tiff(save_tiff_files: tuple[Path, tuple[str], Path]) -> None: + tiff_path, axes, _ = save_tiff_files + img = image(tiff_path, data_axes=axes, coordinate_system="global") + + reference = tiffread(tiff_path) + + assert (img.compute() == reference).all() + + @pytest.mark.parametrize("cli", [True, False]) @pytest.mark.parametrize("element_name", [None, "test_element"]) def test_read_generic_image(runner: CliRunner, cli: bool, element_name: str | None) -> None: From da98469859daebb57753e372ad2a53dbff08fa8d Mon Sep 17 00:00:00 2001 From: Lucas Diedrich Date: Mon, 17 Feb 2025 17:19:45 +0100 Subject: [PATCH 10/39] Fix: Fix jpeg and png reader, fix issues with local variable name --- src/spatialdata_io/readers/generic.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/spatialdata_io/readers/generic.py b/src/spatialdata_io/readers/generic.py index 8aec6765..fd8c17dd 100644 --- a/src/spatialdata_io/readers/generic.py +++ b/src/spatialdata_io/readers/generic.py @@ -124,13 +124,14 @@ def image(input: Path, data_axes: Sequence[str], coordinate_system: str) -> Data if input.suffix in [".tiff", ".tif"]: chunks = _tiff_to_chunks(input, axes_dim_mapping=axes_dim_mapping) - elif input.suffix in [".png", "jpg", "jpeg"]: + image = da.block(chunks, allow_unknown_chunksizes=True) + + elif input.suffix in [".png", ".jpg", ".jpeg"]: image = imread(input) if len(image.shape) == len(data_axes) + 1 and image.shape[0] == 1: image = np.squeeze(image, axis=0) + else: raise NotImplementedError(f"File format {input.suffix} not implemented") - img = da.block(chunks, allow_unknown_chunksizes=True) - - return Image2DModel.parse(img, dims=data_axes, transformations={coordinate_system: Identity()}) + return Image2DModel.parse(image, dims=data_axes, transformations={coordinate_system: Identity()}) From 2349be7a7d036a9c801c66f7d5b698b3d7de9ea1 Mon Sep 17 00:00:00 2001 From: Lucas Diedrich <86159938+lucas-diedrich@users.noreply.github.com> Date: Fri, 2 May 2025 17:41:37 +0200 Subject: [PATCH 11/39] Update src/spatialdata_io/readers/_utils/_image.py Co-authored-by: Wouter-Michiel Vierdag --- src/spatialdata_io/readers/_utils/_image.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/spatialdata_io/readers/_utils/_image.py b/src/spatialdata_io/readers/_utils/_image.py index 28e6e992..8896d78f 100644 --- a/src/spatialdata_io/readers/_utils/_image.py +++ b/src/spatialdata_io/readers/_utils/_image.py @@ -20,7 +20,7 @@ def _compute_chunk_sizes_positions(size: int, chunk: int, min_coord: int) -> tup def _compute_chunks( - dimensions: tuple[int, int], + shape: tuple[int, int], chunk_size: tuple[int, int], min_coordinates: tuple[int, int] = (0, 0), ) -> NDArray[np.int_]: @@ -32,7 +32,7 @@ def _compute_chunks( Parameters ---------- - dimensions : tuple[int, int] + shape : tuple[int, int] Size of the image in (width, height). chunk_size : tuple[int, int] Size of individual tiles in (width, height). 
From 46ed3e5f6e12f792b7f61da396a967ee87a22e6d Mon Sep 17 00:00:00 2001 From: Lucas Diedrich Date: Fri, 2 May 2025 17:46:51 +0200 Subject: [PATCH 12/39] [Refactor|API] Rename dimensions to shape to stick to numpy convention --- src/spatialdata_io/readers/_utils/_image.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/spatialdata_io/readers/_utils/_image.py b/src/spatialdata_io/readers/_utils/_image.py index 8896d78f..d644f934 100644 --- a/src/spatialdata_io/readers/_utils/_image.py +++ b/src/spatialdata_io/readers/_utils/_image.py @@ -45,8 +45,8 @@ def _compute_chunks( Array of shape (n_tiles_x, n_tiles_y, 4). Each entry defines a tile as (x, y, width, height). """ - x_positions, widths = _compute_chunk_sizes_positions(dimensions[1], chunk_size[1], min_coord=min_coordinates[1]) - y_positions, heights = _compute_chunk_sizes_positions(dimensions[0], chunk_size[0], min_coord=min_coordinates[0]) + x_positions, widths = _compute_chunk_sizes_positions(shape[1], chunk_size[1], min_coord=min_coordinates[1]) + y_positions, heights = _compute_chunk_sizes_positions(shape[0], chunk_size[0], min_coord=min_coordinates[0]) # Generate the tiles tiles = np.array( From 99731fedbc9502ddae055e9b26de7251ba5b52b7 Mon Sep 17 00:00:00 2001 From: Lucas Diedrich Date: Fri, 2 May 2025 17:48:54 +0200 Subject: [PATCH 13/39] [Refactor] Make suggested simplification of code, suggested by @melonora --- src/spatialdata_io/readers/_utils/_image.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/spatialdata_io/readers/_utils/_image.py b/src/spatialdata_io/readers/_utils/_image.py index d644f934..4621d5fb 100644 --- a/src/spatialdata_io/readers/_utils/_image.py +++ b/src/spatialdata_io/readers/_utils/_image.py @@ -11,10 +11,7 @@ def _compute_chunk_sizes_positions(size: int, chunk: int, min_coord: int) -> tup """Calculate chunk sizes and positions for a given dimension and chunk size""" # All chunks have the same size except for the last one positions = np.arange(min_coord, min_coord + size, chunk) - lengths = np.full_like(positions, chunk, dtype=int) - - if positions[-1] + chunk > size + min_coord: - lengths[-1] = size + min_coord - positions[-1] + lengths = np.minimum(chunk, min_coord + size - positions) return positions, lengths From f03ca8e8d3f867424b1a526020521c97846d3a68 Mon Sep 17 00:00:00 2001 From: Lucas Diedrich Date: Fri, 2 May 2025 17:52:39 +0200 Subject: [PATCH 14/39] [Test] Add test for compressed tiffs --- tests/test_generic.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/tests/test_generic.py b/tests/test_generic.py index e8414fa7..3251e8fd 100644 --- a/tests/test_generic.py +++ b/tests/test_generic.py @@ -36,19 +36,29 @@ def save_temp_files() -> Generator[tuple[Path, Path, Path], None, None]: yield jpg_path, geojson_path, Path(tmpdir) -@pytest.fixture(scope="module", params=[("c", "y", "x"), ("x", "y", "c")]) +@pytest.fixture( + scope="module", + params=[ + {"axes": ("c", "y", "x"), "compression": None}, + {"axes": ("x", "y", "c"), "compression": None}, + {"axes": ("c", "y", "x"), "compression": "lzw"}, + {"axes": ("x", "y", "c"), "compression": "lzw"}, + ], +) def save_tiff_files( request: pytest.FixtureRequest, ) -> Generator[tuple[Path, tuple[str], Path], None, None]: with tempfile.TemporaryDirectory() as tmpdir: - axes = request.param + axes = request.param["axes"] + compression = request.param["compression"] + sdata = blobs() # save the image as tiff x = sdata["blobs_image"].transpose(*axes).data.compute() img = 
np.clip(x * 255, 0, 255).astype(np.uint8) tiff_path = Path(tmpdir) / "blobs_image.tiff" - tiffwrite(tiff_path, img) + tiffwrite(tiff_path, img, compression=compression) yield tiff_path, axes, Path(tmpdir) From 9e057de33bcdf5072bf3627220d05d6f7b57efde Mon Sep 17 00:00:00 2001 From: Lucas Diedrich Date: Fri, 2 May 2025 18:24:24 +0200 Subject: [PATCH 15/39] [Fix] Account for compressed images --- src/spatialdata_io/readers/generic.py | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/src/spatialdata_io/readers/generic.py b/src/spatialdata_io/readers/generic.py index fd8c17dd..9eb799dd 100644 --- a/src/spatialdata_io/readers/generic.py +++ b/src/spatialdata_io/readers/generic.py @@ -7,6 +7,7 @@ import dask.array as da import numpy as np +import tifffile from dask_image.imread import imread from geopandas import GeoDataFrame from numpy.typing import NDArray @@ -14,7 +15,6 @@ from spatialdata.models import Image2DModel, ShapesModel from spatialdata.models._utils import DEFAULT_COORDINATE_SYSTEM from spatialdata.transformations import Identity -from tifffile import memmap as tiffmmemap from xarray import DataArray from ._utils._image import _compute_chunks, _read_chunks @@ -81,7 +81,9 @@ def geojson(input: Path, coordinate_system: str) -> GeoDataFrame: return ShapesModel.parse(input, transformations={coordinate_system: Identity()}) -def _tiff_to_chunks(input: Path, axes_dim_mapping: dict[str, int]) -> list[list[DaskArray[np.int_]]]: +def _tiff_to_chunks( + input: Path, axes_dim_mapping: dict[str, int] +) -> DaskArray[np.int_] | list[list[DaskArray[np.int_]]]: """Chunkwise reader for tiff files. Parameters @@ -96,7 +98,7 @@ def _tiff_to_chunks(input: Path, axes_dim_mapping: dict[str, int]) -> list[list[ list[list[DaskArray]] """ # Lazy file reader - slide = tiffmmemap(input) + slide = tifffile.memmap(input) # Transpose to cyx order slide = np.transpose(slide, (axes_dim_mapping["c"], axes_dim_mapping["y"], axes_dim_mapping["x"])) @@ -123,8 +125,20 @@ def image(input: Path, data_axes: Sequence[str], coordinate_system: str) -> Data axes_dim_mapping = {axes: ndim for ndim, axes in enumerate(data_axes)} if input.suffix in [".tiff", ".tif"]: - chunks = _tiff_to_chunks(input, axes_dim_mapping=axes_dim_mapping) - image = da.block(chunks, allow_unknown_chunksizes=True) + try: + chunks = _tiff_to_chunks(input, axes_dim_mapping=axes_dim_mapping) + image = da.block(chunks, allow_unknown_chunksizes=True) + + # Edge case: Compressed images are not memory-mappable + except ValueError: + warnings.warn( + "Image data are not memory-mappable, potentially due to compression. 
Trying to load the image into memory at once", + stacklevel=2, + ) + image = imread(input) + if len(image.shape) == len(data_axes) + 1 and image.shape[0] == 1: + image = np.squeeze(image, axis=0) + image = image.transpose(axes_dim_mapping["c"], axes_dim_mapping["y"], axes_dim_mapping["x"]) elif input.suffix in [".png", ".jpg", ".jpeg"]: image = imread(input) From c05b71872151525f456399b8f56b55b006aba650 Mon Sep 17 00:00:00 2001 From: Lucas Diedrich Date: Fri, 2 May 2025 18:31:59 +0200 Subject: [PATCH 16/39] [Refactor] Remove unnecessary type hint --- src/spatialdata_io/readers/generic.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/spatialdata_io/readers/generic.py b/src/spatialdata_io/readers/generic.py index 9eb799dd..b632eba7 100644 --- a/src/spatialdata_io/readers/generic.py +++ b/src/spatialdata_io/readers/generic.py @@ -81,9 +81,7 @@ def geojson(input: Path, coordinate_system: str) -> GeoDataFrame: return ShapesModel.parse(input, transformations={coordinate_system: Identity()}) -def _tiff_to_chunks( - input: Path, axes_dim_mapping: dict[str, int] -) -> DaskArray[np.int_] | list[list[DaskArray[np.int_]]]: +def _tiff_to_chunks(input: Path, axes_dim_mapping: dict[str, int]) -> list[list[DaskArray[np.int_]]]: """Chunkwise reader for tiff files. Parameters From 9f3cc3ceba7f1dc1dadfaf258148d79da666a84f Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Mon, 12 Jan 2026 17:20:07 +0100 Subject: [PATCH 17/39] fix pre-commit --- src/spatialdata_io/readers/_utils/_image.py | 4 ++-- src/spatialdata_io/readers/generic.py | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/spatialdata_io/readers/_utils/_image.py b/src/spatialdata_io/readers/_utils/_image.py index 5993df30..9fab0dd0 100644 --- a/src/spatialdata_io/readers/_utils/_image.py +++ b/src/spatialdata_io/readers/_utils/_image.py @@ -61,9 +61,9 @@ def _read_chunks( slide: Any, coords: NDArray[np.int_], n_channel: int, - dtype: np.number, + dtype: np.dtype[Any], **func_kwargs: Any, -) -> list[list[da.array]]: +) -> list[list[da.Array]]: """Abstract method to tile a large microscopy image. 
Parameters diff --git a/src/spatialdata_io/readers/generic.py b/src/spatialdata_io/readers/generic.py index 5aad2a5e..5f2affde 100644 --- a/src/spatialdata_io/readers/generic.py +++ b/src/spatialdata_io/readers/generic.py @@ -4,7 +4,6 @@ from pathlib import Path from typing import TYPE_CHECKING, Protocol, TypeVar -import dask.array import dask.array as da import numpy as np import tifffile @@ -123,7 +122,7 @@ def _reader_func(slide: NDArray[np.int_], x0: int, y0: int, width: int, height: return _read_chunks(_reader_func, slide, coords=chunk_coords, n_channel=n_channel, dtype=slide.dtype) -def _dask_image_imread(input: Path, data_axes: Sequence[str]) -> DaskArray[int | float]: +def _dask_image_imread(input: Path, data_axes: Sequence[str]) -> da.Array: image = imread(input) if len(image.shape) == len(data_axes) + 1 and image.shape[0] == 1: image = np.squeeze(image, axis=0) From 9d44f25e01c1f0c9be9f3ba9f17f9564186a7317 Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Tue, 13 Jan 2026 14:17:30 +0100 Subject: [PATCH 18/39] fix transpose in image(); wip code review --- src/spatialdata_io/readers/_utils/_image.py | 12 +++++++++++- src/spatialdata_io/readers/generic.py | 2 ++ tests/readers/test_utils_image.py | 1 + tests/test_generic.py | 3 ++- 4 files changed, 16 insertions(+), 2 deletions(-) diff --git a/src/spatialdata_io/readers/_utils/_image.py b/src/spatialdata_io/readers/_utils/_image.py index 9fab0dd0..12c76c0e 100644 --- a/src/spatialdata_io/readers/_utils/_image.py +++ b/src/spatialdata_io/readers/_utils/_image.py @@ -42,6 +42,7 @@ def _compute_chunks( Array of shape (n_tiles_x, n_tiles_y, 4). Each entry defines a tile as (x, y, width, height). """ + # TODO: check x -> 1 and y -> 0? x_positions, widths = _compute_chunk_sizes_positions(shape[1], chunk_size[1], min_coord=min_coordinates[1]) y_positions, heights = _compute_chunk_sizes_positions(shape[0], chunk_size[0], min_coord=min_coordinates[0]) @@ -57,6 +58,8 @@ def _compute_chunks( def _read_chunks( + # TODO: expand type hints for ... + # TODO: really only np.int_? Not float? Consider using np.number func: Callable[..., NDArray[np.int_]], slide: Any, coords: NDArray[np.int_], @@ -80,7 +83,7 @@ def _read_chunks( and should return the chunk as numpy array of shape (c, y, x) slide - Slide image in lazyly loaded format compatible with func + Slide image in lazily loaded format compatible with func coords Coordinates of the upper left corner of the image in format (n_row_x, n_row_y, 4) where the last dimension defines the rectangular tile in format (x, y, width, height). @@ -102,6 +105,7 @@ def _read_chunks( func_kwargs = func_kwargs if func_kwargs else {} # Collect each delayed chunk as item in list of list + # TODO: check, wasn't it x, y and not y, x? 
# Inner list becomes dim=-1 (cols/x) # Outer list becomes dim=-2 (rows/y) # see dask.array.block @@ -117,10 +121,16 @@ def _read_chunks( **func_kwargs, ), dtype=dtype, + # TODO: double check the [3, 2] with debugger shape=(n_channel, *coords[y, x, [3, 2]]), ) + # TODO: seems inconsistent with coords docstring for x in range(coords.shape[1]) ] + # TODO: seems inconsistent with coords docstring for y in range(coords.shape[0]) ] return chunks + + +# TODO: do a asv debugging for peak mem diff --git a/src/spatialdata_io/readers/generic.py b/src/spatialdata_io/readers/generic.py index 5f2affde..412c2923 100644 --- a/src/spatialdata_io/readers/generic.py +++ b/src/spatialdata_io/readers/generic.py @@ -138,9 +138,11 @@ def image(input: Path, data_axes: Sequence[str], coordinate_system: str) -> Data try: chunks = _tiff_to_chunks(input, axes_dim_mapping=axes_dim_mapping) image = da.block(chunks, allow_unknown_chunksizes=True) + data_axes = ["c", "y", "x"] # Edge case: Compressed images are not memory-mappable except ValueError as e: + # TODO: change to logger warning warnings.warn( f"Exception occurred: {str(e)}\nPossible troubleshooting: image data are not memory-mappable, potentially due to compression. Trying to load the image into memory at once", stacklevel=2, diff --git a/tests/readers/test_utils_image.py b/tests/readers/test_utils_image.py index f1c58fa1..5aaabd39 100644 --- a/tests/readers/test_utils_image.py +++ b/tests/readers/test_utils_image.py @@ -13,6 +13,7 @@ [ (300, 100, 0, np.array([0, 100, 200]), np.array([100, 100, 100])), (300, 200, 0, np.array([0, 200]), np.array([200, 100])), + # TODO: why negative coordinates if used only for 0, 0? (300, 100, -100, np.array([-100, 0, 100]), np.array([100, 100, 100])), (300, 200, -100, np.array([-100, 100]), np.array([200, 100])), ], diff --git a/tests/test_generic.py b/tests/test_generic.py index ff4c1d53..31e1c19f 100644 --- a/tests/test_generic.py +++ b/tests/test_generic.py @@ -68,8 +68,9 @@ def test_read_tiff(save_tiff_files: tuple[Path, tuple[str], Path]) -> None: img = image(tiff_path, data_axes=axes, coordinate_system="global") reference = tiffread(tiff_path) + reference_cyx = reference.transpose(*[axes.index(ax) for ax in ["c", "y", "x"]]) - assert (img.compute() == reference).all() + assert (img.compute() == reference_cyx).all() @pytest.mark.parametrize("cli", [True, False]) From c14cb2226b0b3d75144b780097d52d5d3b19cd1e Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Tue, 13 Jan 2026 15:18:10 +0100 Subject: [PATCH 19/39] add test for dask-image fallback for compressed tiffs --- src/spatialdata_io/readers/generic.py | 11 ++++++----- tests/test_generic.py | 17 +++++++++++++---- 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/src/spatialdata_io/readers/generic.py b/src/spatialdata_io/readers/generic.py index 412c2923..08ca2bdb 100644 --- a/src/spatialdata_io/readers/generic.py +++ b/src/spatialdata_io/readers/generic.py @@ -1,6 +1,5 @@ from __future__ import annotations -import warnings from pathlib import Path from typing import TYPE_CHECKING, Protocol, TypeVar @@ -10,6 +9,7 @@ from dask_image.imread import imread from geopandas import GeoDataFrame from spatialdata._docs import docstring_parameter +from spatialdata._logging import logger from spatialdata.models import Image2DModel, ShapesModel from spatialdata.models._utils import DEFAULT_COORDINATE_SYSTEM from spatialdata.transformations import Identity @@ -71,7 +71,7 @@ def generic( coordinate_system = DEFAULT_COORDINATE_SYSTEM if input.suffix in VALID_SHAPE_TYPES: 
if data_axes is not None: - warnings.warn("data_axes is not used for geojson files", UserWarning, stacklevel=2) + logger.warning("data_axes is not used for geojson files") return geojson(input, coordinate_system=coordinate_system) elif input.suffix in VALID_IMAGE_TYPES: if data_axes is None: @@ -143,9 +143,10 @@ def image(input: Path, data_axes: Sequence[str], coordinate_system: str) -> Data # Edge case: Compressed images are not memory-mappable except ValueError as e: # TODO: change to logger warning - warnings.warn( - f"Exception occurred: {str(e)}\nPossible troubleshooting: image data are not memory-mappable, potentially due to compression. Trying to load the image into memory at once", - stacklevel=2, + logger.warning( + f"Exception occurred: {str(e)}\nPossible troubleshooting: image data " + "is not memory-mappable, potentially due to compression. Trying to " + "load the image into memory at once", ) image = _dask_image_imread(input=input, data_axes=data_axes) diff --git a/tests/test_generic.py b/tests/test_generic.py index 31e1c19f..445200dd 100644 --- a/tests/test_generic.py +++ b/tests/test_generic.py @@ -1,3 +1,4 @@ +import logging import tempfile from collections.abc import Generator from contextlib import contextmanager @@ -8,6 +9,7 @@ from click.testing import CliRunner from PIL import Image from spatialdata import SpatialData +from spatialdata._logging import logger from spatialdata.datasets import blobs from tifffile import imread as tiffread from tifffile import imwrite as tiffwrite @@ -60,12 +62,19 @@ def save_tiff_files( tiff_path = Path(tmpdir) / "blobs_image.tiff" tiffwrite(tiff_path, img, compression=compression) - yield tiff_path, axes, Path(tmpdir) + yield tiff_path, axes, compression -def test_read_tiff(save_tiff_files: tuple[Path, tuple[str], Path]) -> None: - tiff_path, axes, _ = save_tiff_files - img = image(tiff_path, data_axes=axes, coordinate_system="global") +def test_read_tiff(save_tiff_files: tuple[Path, tuple[str], str | None], caplog: pytest.LogCaptureFixture) -> None: + tiff_path, axes, compression = save_tiff_files + + logger.propagate = True + with caplog.at_level(logging.WARNING): + img = image(tiff_path, data_axes=axes, coordinate_system="global") + if compression is not None: + assert "image data is not memory-mappable, potentially due to compression" in caplog.text + + logger.propagate = False reference = tiffread(tiff_path) reference_cyx = reference.transpose(*[axes.index(ax) for ax in ["c", "y", "x"]]) From a7a2b923ebe5853a6f7f047285057abb9f50b757 Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Tue, 13 Jan 2026 15:43:06 +0100 Subject: [PATCH 20/39] remove unused min_coordinate --- src/spatialdata_io/readers/_utils/_image.py | 13 +++++------ src/spatialdata_io/readers/generic.py | 2 +- tests/readers/test_utils_image.py | 25 +++++---------------- 3 files changed, 12 insertions(+), 28 deletions(-) diff --git a/src/spatialdata_io/readers/_utils/_image.py b/src/spatialdata_io/readers/_utils/_image.py index 12c76c0e..4bab4038 100644 --- a/src/spatialdata_io/readers/_utils/_image.py +++ b/src/spatialdata_io/readers/_utils/_image.py @@ -7,11 +7,11 @@ from numpy.typing import NDArray -def _compute_chunk_sizes_positions(size: int, chunk: int, min_coord: int) -> tuple[NDArray[np.int_], NDArray[np.int_]]: +def _compute_chunk_sizes_positions(size: int, chunk: int) -> tuple[NDArray[np.int_], NDArray[np.int_]]: """Calculate chunk sizes and positions for a given dimension and chunk size.""" # All chunks have the same size except for the last one - positions = 
np.arange(min_coord, min_coord + size, chunk) - lengths = np.minimum(chunk, min_coord + size - positions) + positions = np.arange(0, size, chunk) + lengths = np.minimum(chunk, size - positions) return positions, lengths @@ -19,7 +19,6 @@ def _compute_chunk_sizes_positions(size: int, chunk: int, min_coord: int) -> tup def _compute_chunks( shape: tuple[int, int], chunk_size: tuple[int, int], - min_coordinates: tuple[int, int] = (0, 0), ) -> NDArray[np.int_]: """Create all chunk specs for a given image and chunk size. @@ -33,8 +32,6 @@ def _compute_chunks( Size of the image in (width, height). chunk_size : tuple[int, int] Size of individual tiles in (width, height). - min_coordinates : tuple[int, int], optional - Minimum coordinates (x, y) in the image, defaults to (0, 0). Returns ------- @@ -43,8 +40,8 @@ def _compute_chunks( as (x, y, width, height). """ # TODO: check x -> 1 and y -> 0? - x_positions, widths = _compute_chunk_sizes_positions(shape[1], chunk_size[1], min_coord=min_coordinates[1]) - y_positions, heights = _compute_chunk_sizes_positions(shape[0], chunk_size[0], min_coord=min_coordinates[0]) + x_positions, widths = _compute_chunk_sizes_positions(shape[1], chunk_size[1]) + y_positions, heights = _compute_chunk_sizes_positions(shape[0], chunk_size[0]) # Generate the tiles tiles = np.array( diff --git a/src/spatialdata_io/readers/generic.py b/src/spatialdata_io/readers/generic.py index 08ca2bdb..668cfc56 100644 --- a/src/spatialdata_io/readers/generic.py +++ b/src/spatialdata_io/readers/generic.py @@ -113,7 +113,7 @@ def _tiff_to_chunks(input: Path, axes_dim_mapping: dict[str, int]) -> list[list[ n_channel = slide.shape[0] # Compute chunk coords - chunk_coords = _compute_chunks(slide_dimensions, chunk_size=DEFAULT_CHUNKSIZE, min_coordinates=(0, 0)) + chunk_coords = _compute_chunks(slide_dimensions, chunk_size=DEFAULT_CHUNKSIZE) # Define reader func def _reader_func(slide: NDArray[np.int_], x0: int, y0: int, width: int, height: int) -> NDArray[np.int_]: diff --git a/tests/readers/test_utils_image.py b/tests/readers/test_utils_image.py index 5aaabd39..bdf5f411 100644 --- a/tests/readers/test_utils_image.py +++ b/tests/readers/test_utils_image.py @@ -9,58 +9,45 @@ @pytest.mark.parametrize( - ("size", "chunk", "min_coordinate", "positions", "lengths"), + ("size", "chunk", "positions", "lengths"), [ - (300, 100, 0, np.array([0, 100, 200]), np.array([100, 100, 100])), - (300, 200, 0, np.array([0, 200]), np.array([200, 100])), - # TODO: why negative coordinates if used only for 0, 0? 
- (300, 100, -100, np.array([-100, 0, 100]), np.array([100, 100, 100])), - (300, 200, -100, np.array([-100, 100]), np.array([200, 100])), + (300, 100, np.array([0, 100, 200]), np.array([100, 100, 100])), + (300, 200, np.array([0, 200]), np.array([200, 100])), ], ) def test_compute_chunk_sizes_positions( size: int, chunk: int, - min_coordinate: int, positions: NDArray[np.number], lengths: NDArray[np.number], ) -> None: - computed_positions, computed_lengths = _compute_chunk_sizes_positions(size, chunk, min_coordinate) + computed_positions, computed_lengths = _compute_chunk_sizes_positions(size, chunk) assert (positions == computed_positions).all() assert (lengths == computed_lengths).all() @pytest.mark.parametrize( - ("dimensions", "chunk_size", "min_coordinates", "result"), + ("dimensions", "chunk_size", "result"), [ # Regular grid 2x2 ( (2, 2), (1, 1), - (0, 0), np.array([[[0, 0, 1, 1], [1, 0, 1, 1]], [[0, 1, 1, 1], [1, 1, 1, 1]]]), ), # Different tile sizes ( (3, 3), (2, 2), - (0, 0), np.array([[[0, 0, 2, 2], [2, 0, 1, 2]], [[0, 2, 2, 1], [2, 2, 1, 1]]]), ), - ( - (2, 2), - (1, 1), - (-1, 0), - np.array([[[0, -1, 1, 1], [1, -1, 1, 1]], [[0, 0, 1, 1], [1, 0, 1, 1]]]), - ), ], ) def test_compute_chunks( dimensions: tuple[int, int], chunk_size: tuple[int, int], - min_coordinates: tuple[int, int], result: NDArray[np.number], ) -> None: - tiles = _compute_chunks(dimensions, chunk_size, min_coordinates) + tiles = _compute_chunks(dimensions, chunk_size) assert (tiles == result).all() From 80a931d036c4fe87059477f352e3ab0f0e88cd92 Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Tue, 13 Jan 2026 16:03:12 +0100 Subject: [PATCH 21/39] fix wrong dimension _compute_chunks(); cover with test --- src/spatialdata_io/readers/_utils/_image.py | 9 +++---- tests/readers/test_utils_image.py | 29 ++++++++++++++------- 2 files changed, 24 insertions(+), 14 deletions(-) diff --git a/src/spatialdata_io/readers/_utils/_image.py b/src/spatialdata_io/readers/_utils/_image.py index 4bab4038..11d18e8e 100644 --- a/src/spatialdata_io/readers/_utils/_image.py +++ b/src/spatialdata_io/readers/_utils/_image.py @@ -39,15 +39,14 @@ def _compute_chunks( Array of shape (n_tiles_x, n_tiles_y, 4). Each entry defines a tile as (x, y, width, height). """ - # TODO: check x -> 1 and y -> 0? 
- x_positions, widths = _compute_chunk_sizes_positions(shape[1], chunk_size[1]) - y_positions, heights = _compute_chunk_sizes_positions(shape[0], chunk_size[0]) + x_positions, widths = _compute_chunk_sizes_positions(shape[0], chunk_size[0]) + y_positions, heights = _compute_chunk_sizes_positions(shape[1], chunk_size[1]) # Generate the tiles tiles = np.array( [ - [[x, y, w, h] for x, w in zip(x_positions, widths, strict=True)] - for y, h in zip(y_positions, heights, strict=True) + [[x, y, w, h] for y, h in zip(y_positions, heights, strict=True)] + for x, w in zip(x_positions, widths, strict=True) ], dtype=int, ) diff --git a/tests/readers/test_utils_image.py b/tests/readers/test_utils_image.py index bdf5f411..b6648556 100644 --- a/tests/readers/test_utils_image.py +++ b/tests/readers/test_utils_image.py @@ -9,7 +9,7 @@ @pytest.mark.parametrize( - ("size", "chunk", "positions", "lengths"), + ("size", "chunk", "expected_positions", "expected_lengths"), [ (300, 100, np.array([0, 100, 200]), np.array([100, 100, 100])), (300, 200, np.array([0, 200]), np.array([200, 100])), @@ -18,12 +18,12 @@ def test_compute_chunk_sizes_positions( size: int, chunk: int, - positions: NDArray[np.number], - lengths: NDArray[np.number], + expected_positions: NDArray[np.number], + expected_lengths: NDArray[np.number], ) -> None: computed_positions, computed_lengths = _compute_chunk_sizes_positions(size, chunk) - assert (positions == computed_positions).all() - assert (lengths == computed_lengths).all() + assert (expected_positions == computed_positions).all() + assert (expected_lengths == computed_lengths).all() @pytest.mark.parametrize( @@ -33,13 +33,24 @@ def test_compute_chunk_sizes_positions( ( (2, 2), (1, 1), - np.array([[[0, 0, 1, 1], [1, 0, 1, 1]], [[0, 1, 1, 1], [1, 1, 1, 1]]]), + np.array( + [ + [[0, 0, 1, 1], [0, 1, 1, 1]], + [[1, 0, 1, 1], [1, 1, 1, 1]], + ] + ), ), # Different tile sizes ( - (3, 3), - (2, 2), - np.array([[[0, 0, 2, 2], [2, 0, 1, 2]], [[0, 2, 2, 1], [2, 2, 1, 1]]]), + (300, 300), + (100, 200), + np.array( + [ + [[0, 0, 100, 200], [0, 200, 100, 100]], + [[100, 0, 100, 200], [100, 200, 100, 100]], + [[200, 0, 100, 200], [200, 200, 100, 100]], + ] + ), ), ], ) From fc26342c25dbf407385e75f4437f3e86a10a67b5 Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Tue, 13 Jan 2026 16:09:10 +0100 Subject: [PATCH 22/39] np._int -> np.number --- src/spatialdata_io/readers/_utils/_image.py | 3 +-- src/spatialdata_io/readers/generic.py | 4 ++-- tests/test_generic.py | 3 +-- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/src/spatialdata_io/readers/_utils/_image.py b/src/spatialdata_io/readers/_utils/_image.py index 11d18e8e..37676833 100644 --- a/src/spatialdata_io/readers/_utils/_image.py +++ b/src/spatialdata_io/readers/_utils/_image.py @@ -55,8 +55,7 @@ def _compute_chunks( def _read_chunks( # TODO: expand type hints for ... - # TODO: really only np.int_? Not float? 
Consider using np.number - func: Callable[..., NDArray[np.int_]], + func: Callable[..., NDArray[np.number]], slide: Any, coords: NDArray[np.int_], n_channel: int, diff --git a/src/spatialdata_io/readers/generic.py b/src/spatialdata_io/readers/generic.py index 668cfc56..45fb2a02 100644 --- a/src/spatialdata_io/readers/generic.py +++ b/src/spatialdata_io/readers/generic.py @@ -86,7 +86,7 @@ def geojson(input: Path, coordinate_system: str) -> GeoDataFrame: return ShapesModel.parse(input, transformations={coordinate_system: Identity()}) -def _tiff_to_chunks(input: Path, axes_dim_mapping: dict[str, int]) -> list[list[DaskArray[np.int_]]]: +def _tiff_to_chunks(input: Path, axes_dim_mapping: dict[str, int]) -> list[list[DaskArray[np.number]]]: """Chunkwise reader for tiff files. Parameters @@ -116,7 +116,7 @@ def _tiff_to_chunks(input: Path, axes_dim_mapping: dict[str, int]) -> list[list[ chunk_coords = _compute_chunks(slide_dimensions, chunk_size=DEFAULT_CHUNKSIZE) # Define reader func - def _reader_func(slide: NDArray[np.int_], x0: int, y0: int, width: int, height: int) -> NDArray[np.int_]: + def _reader_func(slide: NDArray[np.number], x0: int, y0: int, width: int, height: int) -> NDArray[np.number]: return np.array(slide[:, y0 : y0 + height, x0 : x0 + width]) return _read_chunks(_reader_func, slide, coords=chunk_coords, n_channel=n_channel, dtype=slide.dtype) diff --git a/tests/test_generic.py b/tests/test_generic.py index 445200dd..33108e98 100644 --- a/tests/test_generic.py +++ b/tests/test_generic.py @@ -57,10 +57,9 @@ def save_tiff_files( sdata = blobs() # save the image as tiff x = sdata["blobs_image"].transpose(*axes).data.compute() - img = np.clip(x * 255, 0, 255).astype(np.uint8) tiff_path = Path(tmpdir) / "blobs_image.tiff" - tiffwrite(tiff_path, img, compression=compression) + tiffwrite(tiff_path, x, compression=compression) yield tiff_path, axes, compression From 3b296da258afc49c53c01febea86dce31043148b Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Tue, 13 Jan 2026 17:14:56 +0100 Subject: [PATCH 23/39] fix indices in _read_chunks() --- src/spatialdata_io/readers/_utils/_image.py | 32 ++++++++++++--------- src/spatialdata_io/readers/generic.py | 3 +- 2 files changed, 20 insertions(+), 15 deletions(-) diff --git a/src/spatialdata_io/readers/_utils/_image.py b/src/spatialdata_io/readers/_utils/_image.py index 37676833..56ee7f39 100644 --- a/src/spatialdata_io/readers/_utils/_image.py +++ b/src/spatialdata_io/readers/_utils/_image.py @@ -54,7 +54,6 @@ def _compute_chunks( def _read_chunks( - # TODO: expand type hints for ... func: Callable[..., NDArray[np.number]], slide: Any, coords: NDArray[np.int_], @@ -94,13 +93,23 @@ def _read_chunks( Returns ------- list[list[da.array]] - List (length: n_row_x) of lists (length: n_row_y) of chunks. - Represents all chunks of the full image. + (Outer) list (length: n_row_y) of (inner) lists (length: n_row_x) of chunks with axes + (c, y, x). Represents all chunks of the full image. + + Notes + ------- + As seen in _compute_chunks(), since coords are in format (x, y, width, height), the + inner list there (dim=-1) runs over the y values and the outer list (dim=-2) runs + over the x values. In _read_chunks() we have the more common (y, x) format, where + the inner list (dim=-1) runs over the x values and the outer list (dim=-2) runs over + the y values. + + The above can be confusing, and a way to address this is to define coords to be + in format (y, x, height, width) instead of (x, y, width, height). 
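+
+    For example, with a grid of 2 x-chunks and 3 y-chunks, `coords` has shape
+    (2, 3, 4) and `coords[i, j]` holds the (x, y, width, height) spec of
+    x-chunk i and y-chunk j (an illustration of the convention above).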
""" func_kwargs = func_kwargs if func_kwargs else {} # Collect each delayed chunk as item in list of list - # TODO: check, wasn't it x, y and not y, x? # Inner list becomes dim=-1 (cols/x) # Outer list becomes dim=-2 (rows/y) # see dask.array.block @@ -109,21 +118,18 @@ def _read_chunks( da.from_delayed( delayed(func)( slide, - x0=coords[y, x, 0], - y0=coords[y, x, 1], - width=coords[y, x, 2], - height=coords[y, x, 3], + x0=coords[x, y, 0], + y0=coords[x, y, 1], + width=coords[x, y, 2], + height=coords[x, y, 3], **func_kwargs, ), dtype=dtype, - # TODO: double check the [3, 2] with debugger shape=(n_channel, *coords[y, x, [3, 2]]), ) - # TODO: seems inconsistent with coords docstring - for x in range(coords.shape[1]) + for x in range(coords.shape[0]) ] - # TODO: seems inconsistent with coords docstring - for y in range(coords.shape[0]) + for y in range(coords.shape[1]) ] return chunks diff --git a/src/spatialdata_io/readers/generic.py b/src/spatialdata_io/readers/generic.py index 45fb2a02..1fef307d 100644 --- a/src/spatialdata_io/readers/generic.py +++ b/src/spatialdata_io/readers/generic.py @@ -116,7 +116,7 @@ def _tiff_to_chunks(input: Path, axes_dim_mapping: dict[str, int]) -> list[list[ chunk_coords = _compute_chunks(slide_dimensions, chunk_size=DEFAULT_CHUNKSIZE) # Define reader func - def _reader_func(slide: NDArray[np.number], x0: int, y0: int, width: int, height: int) -> NDArray[np.number]: + def _reader_func(slide: np.memmap, x0: int, y0: int, width: int, height: int) -> NDArray[np.number]: return np.array(slide[:, y0 : y0 + height, x0 : x0 + width]) return _read_chunks(_reader_func, slide, coords=chunk_coords, n_channel=n_channel, dtype=slide.dtype) @@ -142,7 +142,6 @@ def image(input: Path, data_axes: Sequence[str], coordinate_system: str) -> Data # Edge case: Compressed images are not memory-mappable except ValueError as e: - # TODO: change to logger warning logger.warning( f"Exception occurred: {str(e)}\nPossible troubleshooting: image data " "is not memory-mappable, potentially due to compression. Trying to " From f7ad81aab9e6f4682fc5523d6038f7579ae26a2a Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Tue, 13 Jan 2026 17:22:02 +0100 Subject: [PATCH 24/39] better english --- src/spatialdata_io/readers/_utils/_image.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/spatialdata_io/readers/_utils/_image.py b/src/spatialdata_io/readers/_utils/_image.py index 56ee7f39..7909de63 100644 --- a/src/spatialdata_io/readers/_utils/_image.py +++ b/src/spatialdata_io/readers/_utils/_image.py @@ -98,14 +98,14 @@ def _read_chunks( Notes ------- - As seen in _compute_chunks(), since coords are in format (x, y, width, height), the - inner list there (dim=-1) runs over the y values and the outer list (dim=-2) runs - over the x values. In _read_chunks() we have the more common (y, x) format, where - the inner list (dim=-1) runs over the x values and the outer list (dim=-2) runs over - the y values. - - The above can be confusing, and a way to address this is to define coords to be - in format (y, x, height, width) instead of (x, y, width, height). + As shown in `_compute_chunks()`, `coords` are in the form `(x, y, + width, height)`. In that function, the inner list (dim = -1) iterates over `y` + values, and the outer list (dim = -2) iterates over `x` values. In `_read_chunks( + )`, we use the more common `(y, x)` ordering: the inner list (dim = -1) iterates + over `x` values, and the outer list (dim = -2) iterates over `y` values. 
+ + This mismatch can be confusing. A straightforward fix is to standardize `coords` + to `(y, x, height, width)` instead of `(x, y, width, height)`. """ func_kwargs = func_kwargs if func_kwargs else {} From 7f5b7ce9e3716d185b8909da9e28b21b4ed37b9f Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Tue, 13 Jan 2026 17:23:03 +0100 Subject: [PATCH 25/39] better docstring --- src/spatialdata_io/readers/_utils/_image.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/spatialdata_io/readers/_utils/_image.py b/src/spatialdata_io/readers/_utils/_image.py index 7909de63..07f4026e 100644 --- a/src/spatialdata_io/readers/_utils/_image.py +++ b/src/spatialdata_io/readers/_utils/_image.py @@ -104,9 +104,12 @@ def _read_chunks( )`, we use the more common `(y, x)` ordering: the inner list (dim = -1) iterates over `x` values, and the outer list (dim = -2) iterates over `y` values. - This mismatch can be confusing. A straightforward fix is to standardize `coords` - to `(y, x, height, width)` instead of `(x, y, width, height)`. + This mismatch can be confusing. A straightforward fix that one could perform would + be to standardize `coords` to `(y, x, height, width)` instead of + `(x, y, width, height)`. """ + # TODO: standardize `coords` as explained in the docstring above, then remove that + # part from the docstring func_kwargs = func_kwargs if func_kwargs else {} # Collect each delayed chunk as item in list of list From 0c482acf1bd331c2a84cd9b19dc47a98f35d18e6 Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Tue, 13 Jan 2026 18:02:39 +0100 Subject: [PATCH 26/39] wip benchmark (bugs) --- .gitignore | 1 + asv.conf.json | 2 +- benchmarks/benchmark_image.py | 97 +++++++++++++++++++ .../{bench_xenium.py => benchmark_xenium.py} | 17 ++-- src/spatialdata_io/readers/generic.py | 66 ++++++++++--- 5 files changed, 159 insertions(+), 24 deletions(-) create mode 100644 benchmarks/benchmark_image.py rename benchmarks/{bench_xenium.py => benchmark_xenium.py} (86%) diff --git a/.gitignore b/.gitignore index 379f16be..19b9d658 100644 --- a/.gitignore +++ b/.gitignore @@ -42,3 +42,4 @@ data data/ tests/data uv.lock +.asv/ diff --git a/asv.conf.json b/asv.conf.json index 516f0f1f..55f61b90 100644 --- a/asv.conf.json +++ b/asv.conf.json @@ -3,7 +3,7 @@ "project": "spatialdata-io", "project_url": "https://github.com/scverse/spatialdata-io", "repo": ".", - "branches": ["main", "xenium-labels-dask", "xenium-labels-dask-zipstore"], + "branches": ["image-reader-chunkwise"], "dvcs": "git", "environment_type": "virtualenv", "pythons": ["3.12"], diff --git a/benchmarks/benchmark_image.py b/benchmarks/benchmark_image.py new file mode 100644 index 00000000..5b83f524 --- /dev/null +++ b/benchmarks/benchmark_image.py @@ -0,0 +1,97 @@ +"""Benchmarks for SpatialData IO operations for large images. + +Instructions: + See benchmark_xenium.py for instructions. 
+""" + +import shutil +from pathlib import Path + +from spatialdata import SpatialData +from xarray import DataArray + +from spatialdata_io import image # type: ignore[attr-defined] + +# ============================================================================= +# CONFIGURATION - Edit these paths to match your setup +# ============================================================================= +SANDBOX_DIR = Path(__file__).parent.parent.parent / "spatialdata-sandbox" +DATASET = "xenium_2.0.0_io" +# ============================================================================= + + +def get_paths() -> tuple[Path, Path]: + """Get paths for benchmark data.""" + path = SANDBOX_DIR / DATASET + # TODO: this is not a good image because it's compressed (not memmappable) + path_read = path / "data" / "morphology.ome.tif" + path_write = path / "data_benchmark.zarr" + + if not path_read.exists(): + raise ValueError(f"Data directory not found: {path_read}") + + return path_read, path_write + + +class IOBenchmarkImage: + """Benchmark IO read operations with different parameter combinations.""" + + timeout = 3600 + repeat = 3 + number = 1 + warmup_time = 0 + processes = 1 + + # Parameter combinations: scale_factors, use_tiff_memmap, chunks + params = [ + [None, [2, 2, 2]], # scale_factors + [True, False], # use_tiff_memmap + [(5000, 5000), (1000, 1000)], # chunks + ] + param_names = ["scale_factors", "use_tiff_memmap", "chunks"] + + def setup(self, *_) -> None: + """Set up paths for benchmarking.""" + self.path_read, self.path_write = get_paths() + if self.path_write.exists(): + shutil.rmtree(self.path_write) + + def _convert_image(self, scale_factors, use_tiff_memmap, chunks) -> SpatialData: + """Read image data with specified parameters.""" + im = image( + input=self.path_read, + data_axes=("c", "y", "x"), + coordinate_system="global", + use_tiff_memmap=use_tiff_memmap, + chunks=chunks, + scale_factors=scale_factors, + ) + sdata = SpatialData.init_from_elements({"image": im}) + # sanity check + if scale_factors is None: + assert isinstance(sdata["image"], DataArray) + else: + assert len(sdata["image"].keys()) == len(scale_factors) + + if chunks is not None: + # TODO: bug here! 
+ assert sdata["image"].chunksizes["x"] == chunks[0] + assert sdata["image"].chunksizes["y"] == chunks[1] + return sdata + + def time_io(self, scale_factors, use_tiff_memmap, chunks) -> None: + """Walltime for data parsing.""" + sdata = self._convert_image(scale_factors, use_tiff_memmap, chunks) + sdata.write(self.path_write) + + def peakmem_io(self, scale_factors, use_tiff_memmap, chunks) -> None: + """Peak memory for data parsing.""" + sdata = self._convert_image(scale_factors, use_tiff_memmap, chunks) + sdata.write(self.path_write) + + +if __name__ == "__main__": + # Run a single test case for quick verification + bench = IOBenchmarkImage() + bench.setup(None, True, (1000, 1000)) + bench.time_io(None, True, (1000, 1000)) diff --git a/benchmarks/bench_xenium.py b/benchmarks/benchmark_xenium.py similarity index 86% rename from benchmarks/bench_xenium.py rename to benchmarks/benchmark_xenium.py index 685e5094..22d70888 100644 --- a/benchmarks/bench_xenium.py +++ b/benchmarks/benchmark_xenium.py @@ -11,18 +11,18 @@ cd /path/to/spatialdata-io # Quick benchmark (single run, for testing): - asv run --python=same -b IOBenchmark --quick --show-stderr -v + asv run --python=same -b IOBenchmarkXenium --quick --show-stderr -v # Full benchmark (multiple runs, for accurate results): - asv run --python=same -b IOBenchmark --show-stderr -v + asv run --python=same -b IOBenchmarkXenium --show-stderr -v Comparing branches: # Run on specific commits: - asv run main^! -b IOBenchmark --show-stderr -v - asv run xenium-labels-dask^! -b IOBenchmark --show-stderr -v + asv run main^! -b IOBenchmarkXenium --show-stderr -v + asv run xenium-labels-dask^! -b IOBenchmarkXenium --show-stderr -v # Or compare two branches directly: - asv continuous main xenium-labels-dask -b IOBenchmark --show-stderr -v + asv continuous main xenium-labels-dask -b IOBenchmarkXenium --show-stderr -v # View comparison: asv compare main xenium-labels-dask @@ -36,7 +36,6 @@ import inspect import shutil from pathlib import Path -from typing import TYPE_CHECKING from spatialdata import SpatialData @@ -62,9 +61,7 @@ def get_paths() -> tuple[Path, Path]: return path_read, path_write -class IOBenchmark: - """Benchmark IO read operations.""" - +class IOBenchmarkXenium: timeout = 3600 repeat = 3 number = 1 @@ -106,4 +103,4 @@ def peakmem_io(self) -> None: if __name__ == "__main__": - IOBenchmark().time_io() + IOBenchmarkXenium().time_io() diff --git a/src/spatialdata_io/readers/generic.py b/src/spatialdata_io/readers/generic.py index 1fef307d..79e33fac 100644 --- a/src/spatialdata_io/readers/generic.py +++ b/src/spatialdata_io/readers/generic.py @@ -25,7 +25,7 @@ VALID_IMAGE_TYPES = [".tif", ".tiff", ".png", ".jpg", ".jpeg"] VALID_SHAPE_TYPES = [".geojson"] -DEFAULT_CHUNKSIZE = (1000, 1000) +DEFAULT_CHUNK_SIZE = (1000, 1000) __all__ = ["generic", "geojson", "image", "VALID_IMAGE_TYPES", "VALID_SHAPE_TYPES"] @@ -86,7 +86,11 @@ def geojson(input: Path, coordinate_system: str) -> GeoDataFrame: return ShapesModel.parse(input, transformations={coordinate_system: Identity()}) -def _tiff_to_chunks(input: Path, axes_dim_mapping: dict[str, int]) -> list[list[DaskArray[np.number]]]: +def _tiff_to_chunks( + input: Path, + axes_dim_mapping: dict[str, int], + chunk_size: tuple[int, int] | None = None, +) -> list[list[DaskArray[np.number]]]: """Chunkwise reader for tiff files. 
Parameters @@ -95,11 +99,16 @@ def _tiff_to_chunks(input: Path, axes_dim_mapping: dict[str, int]) -> list[list[ Path to image axes_dim_mapping Mapping between dimension name (x, y, c) and index + chunk_size + Chunk size in (x, y) order. If None, defaults to DEFAULT_CHUNK_SIZE. Returns ------- list[list[DaskArray]] """ + if chunk_size is None: + chunk_size = DEFAULT_CHUNK_SIZE + # Lazy file reader slide = tifffile.memmap(input) @@ -113,7 +122,7 @@ def _tiff_to_chunks(input: Path, axes_dim_mapping: dict[str, int]) -> list[list[ n_channel = slide.shape[0] # Compute chunk coords - chunk_coords = _compute_chunks(slide_dimensions, chunk_size=DEFAULT_CHUNKSIZE) + chunk_coords = _compute_chunks(slide_dimensions, chunk_size=chunk_size) # Define reader func def _reader_func(slide: np.memmap, x0: int, y0: int, width: int, height: int) -> NDArray[np.number]: @@ -129,15 +138,43 @@ def _dask_image_imread(input: Path, data_axes: Sequence[str]) -> da.Array: return image -def image(input: Path, data_axes: Sequence[str], coordinate_system: str) -> DataArray: - """Reads an image file and returns a parsed Image2D spatial element.""" +def image( + input: Path, + data_axes: Sequence[str], + coordinate_system: str, + use_tiff_memmap: bool = True, + chunks: tuple[int, int] | None = None, + scale_factors: Sequence[int] | None = None, +) -> DataArray: + """Reads an image file and returns a parsed Image2D spatial element. + + Parameters + ---------- + input + Path to the image file. + data_axes + Axes of the data. + coordinate_system + Coordinate system of the spatial element. + use_tiff_memmap + Whether to use memory-mapped reading for TIFF files. + chunks + Chunk size in (x, y) order for TIFF files. If None, defaults to (1000, 1000). + scale_factors + Scale factors for building a multiscale image pyramid. Passed to Image2DModel.parse(). + + Returns + ------- + Parsed Image2D spatial element. + """ # Map passed data axes to position of dimension axes_dim_mapping = {axes: ndim for ndim, axes in enumerate(data_axes)} - if input.suffix in [".tiff", ".tif"]: + im = None + if input.suffix in [".tiff", ".tif"] and use_tiff_memmap: try: - chunks = _tiff_to_chunks(input, axes_dim_mapping=axes_dim_mapping) - image = da.block(chunks, allow_unknown_chunksizes=True) + im_chunks = _tiff_to_chunks(input, axes_dim_mapping=axes_dim_mapping, chunk_size=chunks) + im = da.block(im_chunks, allow_unknown_chunksizes=True) data_axes = ["c", "y", "x"] # Edge case: Compressed images are not memory-mappable @@ -147,11 +184,14 @@ def image(input: Path, data_axes: Sequence[str], coordinate_system: str) -> Data "is not memory-mappable, potentially due to compression. 
Trying to " "load the image into memory at once", ) - image = _dask_image_imread(input=input, data_axes=data_axes) + use_tiff_memmap = False - elif input.suffix in [".png", ".jpg", ".jpeg"]: - image = _dask_image_imread(input=input, data_axes=data_axes) - else: + if input.suffix in [".tiff", ".tif"] and not use_tiff_memmap or input.suffix in [".png", ".jpg", ".jpeg"]: + im = _dask_image_imread(input=input, data_axes=data_axes) + + if im is None: raise NotImplementedError(f"File format {input.suffix} not implemented") - return Image2DModel.parse(image, dims=data_axes, transformations={coordinate_system: Identity()}) + return Image2DModel.parse( + im, dims=data_axes, transformations={coordinate_system: Identity()}, scale_factors=scale_factors + ) From 14665967a82743225eb2d483f9917224f6345699 Mon Sep 17 00:00:00 2001 From: Lucas Diedrich Date: Fri, 16 Jan 2026 09:27:57 +0100 Subject: [PATCH 27/39] [Test] Use small assymetric chunk sizes to capture any issues with the computation order of chunks/assembly --- tests/test_generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_generic.py b/tests/test_generic.py index 33108e98..341e92d2 100644 --- a/tests/test_generic.py +++ b/tests/test_generic.py @@ -69,7 +69,7 @@ def test_read_tiff(save_tiff_files: tuple[Path, tuple[str], str | None], caplog: logger.propagate = True with caplog.at_level(logging.WARNING): - img = image(tiff_path, data_axes=axes, coordinate_system="global") + img = image(tiff_path, data_axes=axes, chunks=(29, 71), coordinate_system="global") if compression is not None: assert "image data is not memory-mappable, potentially due to compression" in caplog.text From a2930d51e5177d3d8f206ce691132edec175108c Mon Sep 17 00:00:00 2001 From: Lucas Diedrich Date: Fri, 16 Jan 2026 10:16:07 +0100 Subject: [PATCH 28/39] Add comment to clarify use of asymmetric chunk sizes in test_read_tiff --- tests/test_generic.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_generic.py b/tests/test_generic.py index 341e92d2..374d96e2 100644 --- a/tests/test_generic.py +++ b/tests/test_generic.py @@ -69,6 +69,7 @@ def test_read_tiff(save_tiff_files: tuple[Path, tuple[str], str | None], caplog: logger.propagate = True with caplog.at_level(logging.WARNING): + # Use asymmetric chunk sizes to catch errors with the ordering of chunk dimensions and the assembly of the individual chunks img = image(tiff_path, data_axes=axes, chunks=(29, 71), coordinate_system="global") if compression is not None: assert "image data is not memory-mappable, potentially due to compression" in caplog.text From a9c2a2bd0b038878835377674800cbee0015e43e Mon Sep 17 00:00:00 2001 From: Lucas Diedrich Date: Fri, 16 Jan 2026 10:23:53 +0100 Subject: [PATCH 29/39] Follow standard convention of image dimensions throughout reader function. Switch axes order everywhere from (x, y) to (y, x) --- src/spatialdata_io/readers/generic.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/spatialdata_io/readers/generic.py b/src/spatialdata_io/readers/generic.py index 79e33fac..aa126464 100644 --- a/src/spatialdata_io/readers/generic.py +++ b/src/spatialdata_io/readers/generic.py @@ -98,9 +98,9 @@ def _tiff_to_chunks( input Path to image axes_dim_mapping - Mapping between dimension name (x, y, c) and index + Mapping between dimension name (c, y, x) and index chunk_size - Chunk size in (x, y) order. If None, defaults to DEFAULT_CHUNK_SIZE. + Chunk size in (y, x) order. If None, defaults to DEFAULT_CHUNK_SIZE. 
Returns ------- @@ -115,8 +115,8 @@ def _tiff_to_chunks( # Transpose to cyx order slide = np.transpose(slide, (axes_dim_mapping["c"], axes_dim_mapping["y"], axes_dim_mapping["x"])) - # Get dimensions in (x, y) - slide_dimensions = slide.shape[2], slide.shape[1] + # Get dimensions in (y, x) + slide_dimensions = slide.shape[1], slide.shape[2] # Get number of channels (c) n_channel = slide.shape[0] @@ -125,7 +125,7 @@ def _tiff_to_chunks( chunk_coords = _compute_chunks(slide_dimensions, chunk_size=chunk_size) # Define reader func - def _reader_func(slide: np.memmap, x0: int, y0: int, width: int, height: int) -> NDArray[np.number]: + def _reader_func(slide: np.memmap, y0: int, x0: int, height: int, width: int) -> NDArray[np.number]: return np.array(slide[:, y0 : y0 + height, x0 : x0 + width]) return _read_chunks(_reader_func, slide, coords=chunk_coords, n_channel=n_channel, dtype=slide.dtype) @@ -159,7 +159,7 @@ def image( use_tiff_memmap Whether to use memory-mapped reading for TIFF files. chunks - Chunk size in (x, y) order for TIFF files. If None, defaults to (1000, 1000). + Chunk size in (y, x) order for TIFF files. If None, defaults to (1000, 1000). scale_factors Scale factors for building a multiscale image pyramid. Passed to Image2DModel.parse(). From 2dabbf9f241bd93784a4e1553e93ac3031b188a4 Mon Sep 17 00:00:00 2001 From: Lucas Diedrich Date: Fri, 16 Jan 2026 10:31:18 +0100 Subject: [PATCH 30/39] Refactor chunk specification to follow (y, x) order and update related documentation 1. Enforce standard dimension order convention (y, x). 2. Change local variable names to better distinguish between chunk-level coordinates and pixel-level coordinates - All chunk indices are indicated as such with the prefix. - The pixel-coordinates of individual chunks are now consistently named (y, x, height, width) --- src/spatialdata_io/readers/_utils/_image.py | 57 +++++++++++---------- 1 file changed, 30 insertions(+), 27 deletions(-) diff --git a/src/spatialdata_io/readers/_utils/_image.py b/src/spatialdata_io/readers/_utils/_image.py index 07f4026e..320ae69c 100644 --- a/src/spatialdata_io/readers/_utils/_image.py +++ b/src/spatialdata_io/readers/_utils/_image.py @@ -22,31 +22,34 @@ def _compute_chunks( ) -> NDArray[np.int_]: """Create all chunk specs for a given image and chunk size. - Creates specifications (x, y, width, height) with (x, y) being the upper left corner + For each chunk in the final image with the chunk coords (block row=chunk_y_position, block col=chunk_x_position), this function + creates specifications (y, x, height, width) with (y, x) being the upper left corner of chunks of size chunk_size. Chunks at the edges correspond to the remainder of chunk size and dimensions Parameters ---------- shape : tuple[int, int] - Size of the image in (width, height). + Size of the image in (image height, image width). chunk_size : tuple[int, int] - Size of individual tiles in (width, height). + Size of individual tiles in (height, width). Returns ------- np.ndarray - Array of shape (n_tiles_x, n_tiles_y, 4). Each entry defines a tile - as (x, y, width, height). + Array of shape (n_tiles_y, n_tiles_x, 4). Each entry defines a tile + as (y, x, height, width). 
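+
+    Examples
+    --------
+    Tiling a 3x5 (height, width) image with 2x2 chunks; edge tiles absorb the
+    remainder (illustrative values that follow from the definitions above):
+
+    >>> tiles = _compute_chunks((3, 5), (2, 2))
+    >>> tiles.shape  # 2 block rows (y), 3 block columns (x)
+    (2, 3, 4)
+    >>> tiles[1, 2]  # last tile: y=2, x=4, height=1, width=1
+    array([2, 4, 1, 1])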
""" - x_positions, widths = _compute_chunk_sizes_positions(shape[0], chunk_size[0]) - y_positions, heights = _compute_chunk_sizes_positions(shape[1], chunk_size[1]) + y_positions, heights = _compute_chunk_sizes_positions(shape[0], chunk_size[0]) + x_positions, widths = _compute_chunk_sizes_positions(shape[1], chunk_size[1]) # Generate the tiles + # Each entry defines the chunk dimensions for a tile + # The order of the chunk definitions (chunk_index_y=outer, chunk_index_x=inner) follows the dask.block convention tiles = np.array( [ - [[x, y, w, h] for y, h in zip(y_positions, heights, strict=True)] - for x, w in zip(x_positions, widths, strict=True) + [[y, x, h, w] for x, w in zip(x_positions, widths, strict=True)] + for y, h in zip(y_positions, heights, strict=True) ], dtype=int, ) @@ -66,23 +69,23 @@ def _read_chunks( Parameters ---------- func - Function to retrieve a rectangular tile from the slide image. Must take the + Function to retrieve a single rectangular tile from the slide image. Must take the arguments: - slide Full slide image - - x0: x (col) coordinate of upper left corner of chunk - y0: y (row) coordinate of upper left corner of chunk - - width: Width of chunk + - x0: x (col) coordinate of upper left corner of chunk - height: Height of chunk + - width: Width of chunk and should return the chunk as numpy array of shape (c, y, x) slide - Slide image in lazily loaded format compatible with func + Slide image in lazily loaded format compatible with `func` coords - Coordinates of the upper left corner of the image in format (n_row_x, n_row_y, 4) - where the last dimension defines the rectangular tile in format (x, y, width, height). - n_row_x represents the number of chunks in x dimension and n_row_y the number of chunks - in y dimension. + Coordinates of the upper left corner of the image in format (n_row_y, n_row_x, 4) + where the last dimension defines the rectangular tile in format (y, x, height, width). + n_row_y represents the number of chunks in y dimension (block rows) and n_row_x the number of chunks + in x dimension (block columns). n_channel Number of channels in array dtype @@ -112,27 +115,27 @@ def _read_chunks( # part from the docstring func_kwargs = func_kwargs if func_kwargs else {} - # Collect each delayed chunk as item in list of list - # Inner list becomes dim=-1 (cols/x) - # Outer list becomes dim=-2 (rows/y) + # Collect each delayed chunk (c, y, x) as item in list of list + # Inner list becomes dim=-1 (chunk columns/x) + # Outer list becomes dim=-2 (chunk rows/y) # see dask.array.block chunks = [ [ da.from_delayed( delayed(func)( slide, - x0=coords[x, y, 0], - y0=coords[x, y, 1], - width=coords[x, y, 2], - height=coords[x, y, 3], + x0=coords[chunk_y, chunk_x, 1], + y0=coords[chunk_y, chunk_x, 0], + width=coords[chunk_y, chunk_x, 3], + height=coords[chunk_y, chunk_x, 2], **func_kwargs, ), dtype=dtype, - shape=(n_channel, *coords[y, x, [3, 2]]), + shape=(n_channel, *coords[chunk_y, chunk_x, [3, 2]]), ) - for x in range(coords.shape[0]) + for chunk_x in range(coords.shape[1]) ] - for y in range(coords.shape[1]) + for chunk_y in range(coords.shape[0]) ] return chunks From 5d9ece2911097559f17ab55c1a9f06379c3f10f6 Mon Sep 17 00:00:00 2001 From: Lucas Diedrich Date: Fri, 16 Jan 2026 10:42:29 +0100 Subject: [PATCH 31/39] [Fix] Shape dimensions were inversed. 
Fix shape specification and make it self-documenting --- src/spatialdata_io/readers/_utils/_image.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/spatialdata_io/readers/_utils/_image.py b/src/spatialdata_io/readers/_utils/_image.py index 320ae69c..e2b229de 100644 --- a/src/spatialdata_io/readers/_utils/_image.py +++ b/src/spatialdata_io/readers/_utils/_image.py @@ -6,6 +6,11 @@ from dask import delayed from numpy.typing import NDArray +_Y_IDX = 0 +_X_IDX = 1 +_HEIGHT_IDX = 2 +_WIDTH_IDX = 3 + def _compute_chunk_sizes_positions(size: int, chunk: int) -> tuple[NDArray[np.int_], NDArray[np.int_]]: """Calculate chunk sizes and positions for a given dimension and chunk size.""" @@ -124,14 +129,14 @@ def _read_chunks( da.from_delayed( delayed(func)( slide, - x0=coords[chunk_y, chunk_x, 1], - y0=coords[chunk_y, chunk_x, 0], - width=coords[chunk_y, chunk_x, 3], - height=coords[chunk_y, chunk_x, 2], + y0=coords[chunk_y, chunk_x, _Y_IDX], + x0=coords[chunk_y, chunk_x, _X_IDX], + height=coords[chunk_y, chunk_x, _HEIGHT_IDX], + width=coords[chunk_y, chunk_x, _WIDTH_IDX], **func_kwargs, ), dtype=dtype, - shape=(n_channel, *coords[chunk_y, chunk_x, [3, 2]]), + shape=(n_channel, coords[chunk_y, chunk_x, _HEIGHT_IDX], coords[chunk_y, chunk_x, _WIDTH_IDX]), ) for chunk_x in range(coords.shape[1]) ] From 21f8fc65e538f7bb4aba8c3caa32e040231e1608 Mon Sep 17 00:00:00 2001 From: Lucas Diedrich Date: Fri, 16 Jan 2026 10:43:22 +0100 Subject: [PATCH 32/39] chore: Remove Note and TODO --- src/spatialdata_io/readers/_utils/_image.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/src/spatialdata_io/readers/_utils/_image.py b/src/spatialdata_io/readers/_utils/_image.py index e2b229de..1857712a 100644 --- a/src/spatialdata_io/readers/_utils/_image.py +++ b/src/spatialdata_io/readers/_utils/_image.py @@ -103,21 +103,7 @@ def _read_chunks( list[list[da.array]] (Outer) list (length: n_row_y) of (inner) lists (length: n_row_x) of chunks with axes (c, y, x). Represents all chunks of the full image. - - Notes - ------- - As shown in `_compute_chunks()`, `coords` are in the form `(x, y, - width, height)`. In that function, the inner list (dim = -1) iterates over `y` - values, and the outer list (dim = -2) iterates over `x` values. In `_read_chunks( - )`, we use the more common `(y, x)` ordering: the inner list (dim = -1) iterates - over `x` values, and the outer list (dim = -2) iterates over `y` values. - - This mismatch can be confusing. A straightforward fix that one could perform would - be to standardize `coords` to `(y, x, height, width)` instead of - `(x, y, width, height)`. 
""" - # TODO: standardize `coords` as explained in the docstring above, then remove that - # part from the docstring func_kwargs = func_kwargs if func_kwargs else {} # Collect each delayed chunk (c, y, x) as item in list of list From 93554e7f43574be3a409645a70277d49b0e76d18 Mon Sep 17 00:00:00 2001 From: Lucas Diedrich Date: Fri, 16 Jan 2026 10:55:30 +0100 Subject: [PATCH 33/39] Clarify documentation --- src/spatialdata_io/readers/_utils/_image.py | 23 ++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/src/spatialdata_io/readers/_utils/_image.py b/src/spatialdata_io/readers/_utils/_image.py index 1857712a..9603dc54 100644 --- a/src/spatialdata_io/readers/_utils/_image.py +++ b/src/spatialdata_io/readers/_utils/_image.py @@ -7,9 +7,16 @@ from numpy.typing import NDArray _Y_IDX = 0 +"""Index of y coordinate in in chunk coordinate array format: (y, x, height, width)""" + _X_IDX = 1 +"""Index of x coordinate in chunk coordinate array format: (y, x, height, width)""" + _HEIGHT_IDX = 2 +"""Index of height specification in chunk coordinate array format: (y, x, height, width)""" + _WIDTH_IDX = 3 +"""Index of width specification in chunk coordinate array format: (y, x, height, width)""" def _compute_chunk_sizes_positions(size: int, chunk: int) -> tuple[NDArray[np.int_], NDArray[np.int_]]: @@ -27,10 +34,11 @@ def _compute_chunks( ) -> NDArray[np.int_]: """Create all chunk specs for a given image and chunk size. - For each chunk in the final image with the chunk coords (block row=chunk_y_position, block col=chunk_x_position), this function - creates specifications (y, x, height, width) with (y, x) being the upper left corner - of chunks of size chunk_size. Chunks at the edges correspond to the remainder of - chunk size and dimensions + Creates chunk specifications for tiling an image. Returns an array where position [i, j] + contains the spec for the chunk at block row i, block column j. + Each chunk specification consists of (y, x, height, width) with (y, x) being the upper left + corner of chunks of size chunk_size. + Chunks at the edges correspond to the remainder of chunk size and dimensions Parameters ---------- @@ -77,7 +85,7 @@ def _read_chunks( Function to retrieve a single rectangular tile from the slide image. Must take the arguments: - - slide Full slide image + - slide: Full slide image - y0: y (row) coordinate of upper left corner of chunk - x0: x (col) coordinate of upper left corner of chunk - height: Height of chunk @@ -87,8 +95,9 @@ def _read_chunks( slide Slide image in lazily loaded format compatible with `func` coords - Coordinates of the upper left corner of the image in format (n_row_y, n_row_x, 4) - where the last dimension defines the rectangular tile in format (y, x, height, width). + Coordinates of the upper left corner of each chunk image in format (n_row_y, n_row_x, 4) + where the last dimension defines the rectangular tile in format (y, x, height, width), as returned + by :func:`_compute_chunks`. n_row_y represents the number of chunks in y dimension (block rows) and n_row_x the number of chunks in x dimension (block columns). 
n_channel From 5a20f413c36b3009c028229313c598b6b60e4109 Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Fri, 16 Jan 2026 11:02:01 +0100 Subject: [PATCH 34/39] fix pre-commit benchmark_image --- benchmarks/benchmark_image.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/benchmarks/benchmark_image.py b/benchmarks/benchmark_image.py index 5b83f524..ea674ac6 100644 --- a/benchmarks/benchmark_image.py +++ b/benchmarks/benchmark_image.py @@ -6,11 +6,12 @@ import shutil from pathlib import Path +from typing import Any from spatialdata import SpatialData from xarray import DataArray -from spatialdata_io import image # type: ignore[attr-defined] +from spatialdata_io import image # ============================================================================= # CONFIGURATION - Edit these paths to match your setup @@ -50,13 +51,15 @@ class IOBenchmarkImage: ] param_names = ["scale_factors", "use_tiff_memmap", "chunks"] - def setup(self, *_) -> None: + def setup(self, *_: Any) -> None: """Set up paths for benchmarking.""" self.path_read, self.path_write = get_paths() if self.path_write.exists(): shutil.rmtree(self.path_write) - def _convert_image(self, scale_factors, use_tiff_memmap, chunks) -> SpatialData: + def _convert_image( + self, scale_factors: list[int] | None, use_tiff_memmap: bool, chunks: tuple[int, int] + ) -> SpatialData: """Read image data with specified parameters.""" im = image( input=self.path_read, @@ -79,12 +82,12 @@ def _convert_image(self, scale_factors, use_tiff_memmap, chunks) -> SpatialData: assert sdata["image"].chunksizes["y"] == chunks[1] return sdata - def time_io(self, scale_factors, use_tiff_memmap, chunks) -> None: + def time_io(self, scale_factors: list[int] | None, use_tiff_memmap: bool, chunks: tuple[int, int]) -> None: """Walltime for data parsing.""" sdata = self._convert_image(scale_factors, use_tiff_memmap, chunks) sdata.write(self.path_write) - def peakmem_io(self, scale_factors, use_tiff_memmap, chunks) -> None: + def peakmem_io(self, scale_factors: list[int] | None, use_tiff_memmap: bool, chunks: tuple[int, int]) -> None: """Peak memory for data parsing.""" sdata = self._convert_image(scale_factors, use_tiff_memmap, chunks) sdata.write(self.path_write) From a890951e1cb5d16c591bff75b88f3985951a2da1 Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Fri, 16 Jan 2026 12:50:39 +0100 Subject: [PATCH 35/39] wip fix chunks --- benchmarks/benchmark_image.py | 54 +++++++++++++++++++++++---- benchmarks/benchmark_xenium.py | 4 +- src/spatialdata_io/readers/generic.py | 16 +++++--- 3 files changed, 59 insertions(+), 15 deletions(-) diff --git a/benchmarks/benchmark_image.py b/benchmarks/benchmark_image.py index ea674ac6..5b9b8095 100644 --- a/benchmarks/benchmark_image.py +++ b/benchmarks/benchmark_image.py @@ -11,7 +11,7 @@ from spatialdata import SpatialData from xarray import DataArray -from spatialdata_io import image +from spatialdata_io import image # type: ignore[attr-defined] # ============================================================================= # CONFIGURATION - Edit these paths to match your setup @@ -73,13 +73,29 @@ def _convert_image( # sanity check if scale_factors is None: assert isinstance(sdata["image"], DataArray) + if chunks is not None: + assert ( + sdata["image"].chunksizes["x"][0] == chunks[0] + or sdata["image"].chunksizes["x"][0] == sdata["image"].shape[2] + ) + assert ( + sdata["image"].chunksizes["y"][0] == chunks[1] + or sdata["image"].chunksizes["y"][0] == sdata["image"].shape[1] + ) else: - assert 
len(sdata["image"].keys()) == len(scale_factors) + assert len(sdata["image"].keys()) == len(scale_factors) + 1 + if chunks is not None: + assert ( + sdata["image"]["scale0"]["image"].chunksizes["x"][0] == chunks[0] + or sdata["image"]["scale0"]["image"].chunksizes["x"][0] + == sdata["image"]["scale0"]["image"].shape[2] + ) + assert ( + sdata["image"]["scale0"]["image"].chunksizes["y"][0] == chunks[1] + or sdata["image"]["scale0"]["image"].chunksizes["y"][0] + == sdata["image"]["scale0"]["image"].shape[1] + ) - if chunks is not None: - # TODO: bug here! - assert sdata["image"].chunksizes["x"] == chunks[0] - assert sdata["image"].chunksizes["y"] == chunks[1] return sdata def time_io(self, scale_factors: list[int] | None, use_tiff_memmap: bool, chunks: tuple[int, int]) -> None: @@ -96,5 +112,27 @@ def peakmem_io(self, scale_factors: list[int] | None, use_tiff_memmap: bool, chu if __name__ == "__main__": # Run a single test case for quick verification bench = IOBenchmarkImage() - bench.setup(None, True, (1000, 1000)) - bench.time_io(None, True, (1000, 1000)) + + # bench.setup() + # bench.time_io(None, True, (5000, 5000)) + + # bench.setup() + # bench.time_io(None, True, (1000, 1000)) + + # bench.setup() + # bench.time_io(None, False, (5000, 5000)) + + # bench.setup() + # bench.time_io(None, False, (1000, 1000)) + + # bench.setup() + # bench.time_io([2, 2, 2], True, (5000, 5000)) + + # bench.setup() + # bench.time_io([2, 2, 2], True, (1000, 1000)) + + bench.setup() + bench.time_io([2, 2, 2], False, (5000, 5000)) + + # bench.setup() + # bench.time_io([2, 2, 2], False, (1000, 1000)) diff --git a/benchmarks/benchmark_xenium.py b/benchmarks/benchmark_xenium.py index 22d70888..b1f09afe 100644 --- a/benchmarks/benchmark_xenium.py +++ b/benchmarks/benchmark_xenium.py @@ -103,4 +103,6 @@ def peakmem_io(self) -> None: if __name__ == "__main__": - IOBenchmarkXenium().time_io() + benchmark = IOBenchmarkXenium() + benchmark.setup() + benchmark.time_io() diff --git a/src/spatialdata_io/readers/generic.py b/src/spatialdata_io/readers/generic.py index aa126464..1d126192 100644 --- a/src/spatialdata_io/readers/generic.py +++ b/src/spatialdata_io/readers/generic.py @@ -131,11 +131,15 @@ def _reader_func(slide: np.memmap, y0: int, x0: int, height: int, width: int) -> return _read_chunks(_reader_func, slide, coords=chunk_coords, n_channel=n_channel, dtype=slide.dtype) -def _dask_image_imread(input: Path, data_axes: Sequence[str]) -> da.Array: +def _dask_image_imread(input: Path, data_axes: Sequence[str], chunks: tuple[int, int] | None = None) -> da.Array: + if set(data_axes) != {"c", "y", "x"}: + raise NotImplementedError(f"Only 'c', 'y', 'x' axes are supported, got {data_axes}") image = imread(input) - if len(image.shape) == len(data_axes) + 1 and image.shape[0] == 1: - image = np.squeeze(image, axis=0) - return image + if image.ndim != len(data_axes): + raise ValueError(f"Expected image with {len(data_axes)} dimensions, got {image.ndim}") + image = image.transpose(*[data_axes.index(ax) for ax in ["c", "y", "x"]]) + chunks = (1,) + chunks + return image.rechunk(chunks) def image( @@ -187,11 +191,11 @@ def image( use_tiff_memmap = False if input.suffix in [".tiff", ".tif"] and not use_tiff_memmap or input.suffix in [".png", ".jpg", ".jpeg"]: - im = _dask_image_imread(input=input, data_axes=data_axes) + im = _dask_image_imread(input=input, data_axes=data_axes, chunks=chunks) if im is None: raise NotImplementedError(f"File format {input.suffix} not implemented") return Image2DModel.parse( - im, dims=data_axes, 
transformations={coordinate_system: Identity()}, scale_factors=scale_factors + im, dims=data_axes, transformations={coordinate_system: Identity()}, scale_factors=scale_factors, chunks=chunks ) From 19df32530e463689af8ab108b9c1c84117e396bc Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Fri, 16 Jan 2026 16:59:20 +0100 Subject: [PATCH 36/39] improve chunks support --- src/spatialdata_io/readers/_utils/_image.py | 63 +++++++++++++- src/spatialdata_io/readers/generic.py | 91 +++++++++++++++------ tests/readers/test_utils_image.py | 39 +++++++++ tests/test_generic.py | 38 +++++++-- 4 files changed, 194 insertions(+), 37 deletions(-) diff --git a/src/spatialdata_io/readers/_utils/_image.py b/src/spatialdata_io/readers/_utils/_image.py index 9603dc54..3784ccca 100644 --- a/src/spatialdata_io/readers/_utils/_image.py +++ b/src/spatialdata_io/readers/_utils/_image.py @@ -1,10 +1,13 @@ -from collections.abc import Callable +from collections.abc import Callable, Mapping, Sequence from typing import Any import dask.array as da import numpy as np from dask import delayed from numpy.typing import NDArray +from spatialdata.models.models import Chunks_t + +__all__ = ["Chunks_t", "_compute_chunks", "_read_chunks", "normalize_chunks"] _Y_IDX = 0 """Index of y coordinate in in chunk coordinate array format: (y, x, height, width)""" @@ -18,6 +21,8 @@ _WIDTH_IDX = 3 """Index of width specification in chunk coordinate array format: (y, x, height, width)""" +DEFAULT_CHUNK_SIZE = 1000 + def _compute_chunk_sizes_positions(size: int, chunk: int) -> tuple[NDArray[np.int_], NDArray[np.int_]]: """Calculate chunk sizes and positions for a given dimension and chunk size.""" @@ -140,4 +145,58 @@ def _read_chunks( return chunks -# TODO: do a asv debugging for peak mem +def normalize_chunks( + chunks: Chunks_t | None, + axes: Sequence[str], +) -> dict[str, int]: + """Normalize chunk specification to dict format. + + This function converts various chunk formats to a dict mapping dimension names + to chunk sizes. The dict format is preferred because it's explicit about which + dimension gets which chunk size and is compatible with spatialdata. + + Parameters + ---------- + chunks + Chunk specification. Can be: + - None: Uses DEFAULT_CHUNK_SIZE for all axes + - int: Applied to all axes + - tuple[int, ...]: Chunk sizes in order corresponding to axes + - dict: Mapping of axis names to chunk sizes (validated against axes) + axes + Tuple of axis names that defines the expected dimensions (e.g., ('c', 'y', 'x')). + + Returns + ------- + dict[str, int] + Dict mapping axis names to chunk sizes. + + Raises + ------ + ValueError + If chunks format is not supported or incompatible with axes. + """ + if chunks is None: + return dict.fromkeys(axes, DEFAULT_CHUNK_SIZE) + + if isinstance(chunks, int): + return dict.fromkeys(axes, chunks) + + if isinstance(chunks, Mapping): + chunks_dict = dict(chunks) + missing = set(axes) - set(chunks_dict.keys()) + if missing: + raise ValueError(f"chunks dict missing keys for axes {missing}, got: {list(chunks_dict.keys())}") + return {ax: chunks_dict[ax] for ax in axes} + + if isinstance(chunks, tuple): + if len(chunks) != len(axes): + raise ValueError(f"chunks tuple length {len(chunks)} doesn't match axes {axes} (length {len(axes)})") + if not all(isinstance(c, int) for c in chunks): + raise ValueError(f"All elements in chunks tuple must be int, got: {chunks}") + return dict(zip(axes, chunks, strict=True)) + + raise ValueError(f"Unsupported chunks type: {type(chunks)}. 
Expected int, tuple, dict, or None.") + + +## diff --git a/src/spatialdata_io/readers/generic.py b/src/spatialdata_io/readers/generic.py index 1d126192..462989e0 100644 --- a/src/spatialdata_io/readers/generic.py +++ b/src/spatialdata_io/readers/generic.py @@ -19,13 +19,18 @@ from geopandas import GeoDataFrame from numpy.typing import NDArray + from spatialdata.models.models import Chunks_t from xarray import DataArray -from ._utils._image import _compute_chunks, _read_chunks + +from spatialdata_io.readers._utils._image import ( + _compute_chunks, + _read_chunks, + normalize_chunks, +) VALID_IMAGE_TYPES = [".tif", ".tiff", ".png", ".jpg", ".jpeg"] VALID_SHAPE_TYPES = [".geojson"] -DEFAULT_CHUNK_SIZE = (1000, 1000) __all__ = ["generic", "geojson", "image", "VALID_IMAGE_TYPES", "VALID_SHAPE_TYPES"] @@ -89,26 +94,29 @@ def geojson(input: Path, coordinate_system: str) -> GeoDataFrame: def _tiff_to_chunks( input: Path, axes_dim_mapping: dict[str, int], - chunk_size: tuple[int, int] | None = None, + chunks_cyx: dict[str, int], ) -> list[list[DaskArray[np.number]]]: """Chunkwise reader for tiff files. + Creates spatial tiles from a TIFF file. Each tile contains all channels. + Channel chunking is handled downstream by Image2DModel.parse(). + Parameters ---------- input Path to image axes_dim_mapping Mapping between dimension name (c, y, x) and index - chunk_size - Chunk size in (y, x) order. If None, defaults to DEFAULT_CHUNK_SIZE. + chunks_cyx + Chunk size dict with 'c', 'y', and 'x' keys. The 'y' and 'x' values + are used for spatial tiling. The 'c' value is passed through for + downstream rechunking. Returns ------- list[list[DaskArray]] + 2D list of dask arrays representing spatial tiles, each with shape (n_channels, height, width). """ - if chunk_size is None: - chunk_size = DEFAULT_CHUNK_SIZE - # Lazy file reader slide = tifffile.memmap(input) @@ -118,28 +126,49 @@ def _tiff_to_chunks( # Get dimensions in (y, x) slide_dimensions = slide.shape[1], slide.shape[2] - # Get number of channels (c) + # Get number of channels (all channels are included in each spatial tile) n_channel = slide.shape[0] - # Compute chunk coords - chunk_coords = _compute_chunks(slide_dimensions, chunk_size=chunk_size) + # Compute chunk coords using (y, x) tuple + chunk_coords = _compute_chunks(slide_dimensions, chunk_size=(chunks_cyx["y"], chunks_cyx["x"])) - # Define reader func + # Define reader func - reads all channels for each spatial tile def _reader_func(slide: np.memmap, y0: int, x0: int, height: int, width: int) -> NDArray[np.number]: return np.array(slide[:, y0 : y0 + height, x0 : x0 + width]) return _read_chunks(_reader_func, slide, coords=chunk_coords, n_channel=n_channel, dtype=slide.dtype) -def _dask_image_imread(input: Path, data_axes: Sequence[str], chunks: tuple[int, int] | None = None) -> da.Array: +def _dask_image_imread(input: Path, data_axes: Sequence[str], chunks_cyx: dict[str, int]) -> da.Array: + """Read image using dask-image and rechunk. + + Parameters + ---------- + input + Path to image file. + data_axes + Axes of the input data. + chunks_cyx + Chunk size dict with 'c', 'y', 'x' keys. + + Returns + ------- + Dask array with (c, y, x) axes order. 
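+
+    Examples
+    --------
+    Illustrative sketch; ``img.png`` is a hypothetical file stored as (y, x, c):
+
+    >>> arr = _dask_image_imread(
+    ...     Path("img.png"), data_axes=("y", "x", "c"), chunks_cyx={"c": 1, "y": 256, "x": 256}
+    ... )
+    >>> arr.chunksize  # (c, y, x); assumes an image of at least 256x256 pixels
+    (1, 256, 256)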
+ """ if set(data_axes) != {"c", "y", "x"}: raise NotImplementedError(f"Only 'c', 'y', 'x' axes are supported, got {data_axes}") - image = imread(input) - if image.ndim != len(data_axes): - raise ValueError(f"Expected image with {len(data_axes)} dimensions, got {image.ndim}") - image = image.transpose(*[data_axes.index(ax) for ax in ["c", "y", "x"]]) - chunks = (1,) + chunks - return image.rechunk(chunks) + im = imread(input) + + # dask_image.imread may add an extra leading dimension for frames/pages + # If image has one extra dimension with size 1, squeeze it out + if im.ndim == len(data_axes) + 1 and im.shape[0] == 1: + im = im[0] + + if im.ndim != len(data_axes): + raise ValueError(f"Expected image with {len(data_axes)} dimensions, got {im.ndim}") + + im = im.transpose(*[data_axes.index(ax) for ax in ["c", "y", "x"]]) + return im.rechunk((chunks_cyx["c"], chunks_cyx["y"], chunks_cyx["x"])) def image( @@ -147,7 +176,7 @@ def image( data_axes: Sequence[str], coordinate_system: str, use_tiff_memmap: bool = True, - chunks: tuple[int, int] | None = None, + chunks: Chunks_t | None = None, scale_factors: Sequence[int] | None = None, ) -> DataArray: """Reads an image file and returns a parsed Image2D spatial element. @@ -157,13 +186,17 @@ def image( input Path to the image file. data_axes - Axes of the data. + Axes of the data (e.g., ('c', 'y', 'x') or ('y', 'x', 'c')). coordinate_system Coordinate system of the spatial element. use_tiff_memmap Whether to use memory-mapped reading for TIFF files. chunks - Chunk size in (y, x) order for TIFF files. If None, defaults to (1000, 1000). + Chunk size specification. Can be: + - int: Applied to all dimensions + - tuple: Chunk sizes matching the order of output axes (c, y, x) + - dict: Mapping of axis names to chunk sizes (e.g., {'c': 1, 'y': 1000, 'x': 1000}) + If None, uses a default (DEFAULT_CHUNK_SIZE) for all axes. scale_factors Scale factors for building a multiscale image pyramid. Passed to Image2DModel.parse(). 
@@ -174,12 +207,13 @@ def image( # Map passed data axes to position of dimension axes_dim_mapping = {axes: ndim for ndim, axes in enumerate(data_axes)} + chunks_dict = normalize_chunks(chunks, axes=data_axes) + im = None if input.suffix in [".tiff", ".tif"] and use_tiff_memmap: try: - im_chunks = _tiff_to_chunks(input, axes_dim_mapping=axes_dim_mapping, chunk_size=chunks) + im_chunks = _tiff_to_chunks(input, axes_dim_mapping=axes_dim_mapping, chunks_cyx=chunks_dict) im = da.block(im_chunks, allow_unknown_chunksizes=True) - data_axes = ["c", "y", "x"] # Edge case: Compressed images are not memory-mappable except ValueError as e: @@ -191,11 +225,16 @@ def image( use_tiff_memmap = False if input.suffix in [".tiff", ".tif"] and not use_tiff_memmap or input.suffix in [".png", ".jpg", ".jpeg"]: - im = _dask_image_imread(input=input, data_axes=data_axes, chunks=chunks) + im = _dask_image_imread(input=input, data_axes=data_axes, chunks_cyx=chunks_dict) if im is None: raise NotImplementedError(f"File format {input.suffix} not implemented") + # the output axes are always cyx return Image2DModel.parse( - im, dims=data_axes, transformations={coordinate_system: Identity()}, scale_factors=scale_factors, chunks=chunks + im, + dims=("c", "y", "x"), + transformations={coordinate_system: Identity()}, + scale_factors=scale_factors, + chunks=chunks_dict, ) diff --git a/tests/readers/test_utils_image.py b/tests/readers/test_utils_image.py index b6648556..c2e08230 100644 --- a/tests/readers/test_utils_image.py +++ b/tests/readers/test_utils_image.py @@ -3,8 +3,11 @@ from numpy.typing import NDArray from spatialdata_io.readers._utils._image import ( + DEFAULT_CHUNK_SIZE, + Chunks_t, _compute_chunk_sizes_positions, _compute_chunks, + normalize_chunks, ) @@ -62,3 +65,39 @@ def test_compute_chunks( tiles = _compute_chunks(dimensions, chunk_size) assert (tiles == result).all() + + +@pytest.mark.parametrize( + "chunks, axes, expected", + [ + # 2D (y, x) + (None, ("y", "x"), {"y": DEFAULT_CHUNK_SIZE, "x": DEFAULT_CHUNK_SIZE}), + (256, ("y", "x"), {"y": 256, "x": 256}), + ((200, 100), ("x", "y"), {"y": 100, "x": 200}), + ({"y": 300, "x": 400}, ("x", "y"), {"y": 300, "x": 400}), + # 2D with channel (c, y, x) + (None, ("c", "y", "x"), {"c": DEFAULT_CHUNK_SIZE, "y": DEFAULT_CHUNK_SIZE, "x": DEFAULT_CHUNK_SIZE}), + (256, ("c", "y", "x"), {"c": 256, "y": 256, "x": 256}), + ((1, 100, 200), ("c", "y", "x"), {"c": 1, "y": 100, "x": 200}), + ({"c": 1, "y": 300, "x": 400}, ("c", "y", "x"), {"c": 1, "y": 300, "x": 400}), + # 3D (z, y, x) + ((10, 100, 200), ("z", "y", "x"), {"z": 10, "y": 100, "x": 200}), + ({"z": 10, "y": 300, "x": 400}, ("z", "y", "x"), {"z": 10, "y": 300, "x": 400}), + ], +) +def test_normalize_chunks_valid(chunks: Chunks_t, axes: tuple[str, ...], expected: dict[str, int]) -> None: + assert normalize_chunks(chunks, axes=axes) == expected + + +@pytest.mark.parametrize( + "chunks, axes, match", + [ + ({"y": 100}, ("y", "x"), "missing keys for axes"), + ((1, 2, 3), ("y", "x"), "doesn't match axes"), + ((1.5, 2), ("y", "x"), "must be int"), + ("invalid", ("y", "x"), "Unsupported chunks type"), + ], +) +def test_normalize_chunks_errors(chunks: Chunks_t, axes: tuple[str, ...], match: str) -> None: + with pytest.raises(ValueError, match=match): + normalize_chunks(chunks, axes=axes) diff --git a/tests/test_generic.py b/tests/test_generic.py index 374d96e2..dd46e15d 100644 --- a/tests/test_generic.py +++ b/tests/test_generic.py @@ -64,22 +64,42 @@ def save_tiff_files( yield tiff_path, axes, compression -def 
test_read_tiff(save_tiff_files: tuple[Path, tuple[str], str | None], caplog: pytest.LogCaptureFixture) -> None: +@pytest.mark.parametrize("scale_factors", [None, [2]]) +def test_read_tiff( + save_tiff_files: tuple[Path, tuple[str, ...], str | None], + caplog: pytest.LogCaptureFixture, + scale_factors: list[int] | None, +) -> None: tiff_path, axes, compression = save_tiff_files + # Use asymmetric chunk sizes to catch errors with the ordering of chunk dimensions and the assembly of the individual chunks + CHUNKS = {"c": 2, "y": 29, "x": 71} logger.propagate = True with caplog.at_level(logging.WARNING): - # Use asymmetric chunk sizes to catch errors with the ordering of chunk dimensions and the assembly of the individual chunks - img = image(tiff_path, data_axes=axes, chunks=(29, 71), coordinate_system="global") - if compression is not None: - assert "image data is not memory-mappable, potentially due to compression" in caplog.text - + obj = image( + tiff_path, + data_axes=axes, + coordinate_system="global", + chunks=CHUNKS, + scale_factors=scale_factors, + use_tiff_memmap=True, + ) logger.propagate = False + assert ("image data is not memory-mappable" in caplog.text) == (compression is not None) + + target = obj if scale_factors is None else obj["scale0"]["image"] + + # check chunks are correct + for i, ax in enumerate(("c", "y", "x")): + assert target.chunksizes[ax][0] in (CHUNKS[ax], target.shape[i]) - reference = tiffread(tiff_path) - reference_cyx = reference.transpose(*[axes.index(ax) for ax in ["c", "y", "x"]]) + # check multiscale is correct + if scale_factors is not None: + assert "scale0" in obj and len(obj.keys()) == len(scale_factors) + 1 - assert (img.compute() == reference_cyx).all() + # check pixel data + ref = tiffread(tiff_path).transpose(*[axes.index(ax) for ax in ("c", "y", "x")]) + assert (target.compute() == ref).all() @pytest.mark.parametrize("cli", [True, False]) From 163d8b040c24623ad0e11409fd58bdad7700af37 Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Fri, 16 Jan 2026 18:38:05 +0100 Subject: [PATCH 37/39] benchmark for image() with synthetic data --- .gitignore | 1 + benchmarks/benchmark_image.py | 187 ++++++++++++++++++++++------------ 2 files changed, 125 insertions(+), 63 deletions(-) diff --git a/.gitignore b/.gitignore index 19b9d658..35db05af 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,7 @@ temp/ # Compiled files __pycache__/ +.ipynb_checkpoints/ # Distribution / packaging /build/ diff --git a/benchmarks/benchmark_image.py b/benchmarks/benchmark_image.py index 5b9b8095..a872b5c2 100644 --- a/benchmarks/benchmark_image.py +++ b/benchmarks/benchmark_image.py @@ -4,36 +4,28 @@ See benchmark_xenium.py for instructions. 
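
For example, a quick single run (assuming the asv setup described there):

    asv run --python=same -b IOBenchmarkImage --quick --show-stderr -v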
""" -import shutil +import logging +import logging.handlers +import tempfile from pathlib import Path from typing import Any +import numpy as np +import tifffile from spatialdata import SpatialData +from spatialdata._logging import logger from xarray import DataArray from spatialdata_io import image # type: ignore[attr-defined] # ============================================================================= -# CONFIGURATION - Edit these paths to match your setup +# CONFIGURATION - Edit these values to match your setup # ============================================================================= -SANDBOX_DIR = Path(__file__).parent.parent.parent / "spatialdata-sandbox" -DATASET = "xenium_2.0.0_io" +# Image dimensions: (channels, height, width) +IMAGE_SHAPE = (3, 30000, 30000) # ============================================================================= -def get_paths() -> tuple[Path, Path]: - """Get paths for benchmark data.""" - path = SANDBOX_DIR / DATASET - # TODO: this is not a good image because it's compressed (not memmappable) - path_read = path / "data" / "morphology.ome.tif" - path_write = path / "data_benchmark.zarr" - - if not path_read.exists(): - raise ValueError(f"Data directory not found: {path_read}") - - return path_read, path_write - - class IOBenchmarkImage: """Benchmark IO read operations with different parameter combinations.""" @@ -45,37 +37,93 @@ class IOBenchmarkImage: # Parameter combinations: scale_factors, use_tiff_memmap, chunks params = [ - [None, [2, 2, 2]], # scale_factors + [None, [2, 2]], # scale_factors [True, False], # use_tiff_memmap - [(5000, 5000), (1000, 1000)], # chunks + [(1, 5000, 5000), (1, 1000, 1000)], # chunks ] param_names = ["scale_factors", "use_tiff_memmap", "chunks"] + # Class-level temp directory for image files (persists across all benchmarks) + _images_temp_dir: tempfile.TemporaryDirectory[str] | None = None + _path_read_uncompressed: Path | None = None + _path_read_compressed: Path | None = None + + @classmethod + def _setup_images(cls) -> None: + """Create fake image data once for all benchmarks.""" + if cls._images_temp_dir is not None: + return + + cls._images_temp_dir = tempfile.TemporaryDirectory() + images_dir = Path(cls._images_temp_dir.name) + cls._path_read_uncompressed = images_dir / "image_uncompressed.tif" + cls._path_read_compressed = images_dir / "image_compressed.tif" + + # Generate fake image data + rng = np.random.default_rng(42) + data = rng.integers(0, 255, size=IMAGE_SHAPE, dtype=np.uint8) + + # Write uncompressed TIFF (memmappable) + tifffile.imwrite(cls._path_read_uncompressed, data, compression=None) + # Write compressed TIFF (not memmappable) + tifffile.imwrite(cls._path_read_compressed, data, compression="zlib") + def setup(self, *_: Any) -> None: """Set up paths for benchmarking.""" - self.path_read, self.path_write = get_paths() - if self.path_write.exists(): - shutil.rmtree(self.path_write) + # Create images once (shared across all benchmark runs) + self._setup_images() + self.path_read_uncompressed = self._path_read_uncompressed + self.path_read_compressed = self._path_read_compressed + + # Create a separate temp directory for output (cleaned up after each run) + self._output_temp_dir = tempfile.TemporaryDirectory() + self.path_write = Path(self._output_temp_dir.name) / "data_benchmark.zarr" + + def teardown(self, *_: Any) -> None: + """Clean up output directory after each benchmark run.""" + if hasattr(self, "_output_temp_dir"): + self._output_temp_dir.cleanup() def _convert_image( - self, scale_factors: 
+        self, scale_factors: list[int] | None, use_tiff_memmap: bool, chunks: tuple[int, ...]
     ) -> SpatialData:
         """Read image data with specified parameters."""
-        im = image(
-            input=self.path_read,
-            data_axes=("c", "y", "x"),
-            coordinate_system="global",
-            use_tiff_memmap=use_tiff_memmap,
-            chunks=chunks,
-            scale_factors=scale_factors,
-        )
+        # Use uncompressed (memmappable) for use_tiff_memmap=True, compressed otherwise
+        path_read = self.path_read_uncompressed if use_tiff_memmap else self.path_read_compressed
+
+        # Capture log messages to verify memmappable warning behavior
+        log_capture = logging.handlers.MemoryHandler(capacity=100)
+        log_capture.setLevel(logging.WARNING)
+        logger.addHandler(log_capture)
+        original_propagate = logger.propagate
+        logger.propagate = True
+
+        try:
+            im = image(
+                input=path_read,
+                data_axes=("c", "y", "x"),
+                coordinate_system="global",
+                use_tiff_memmap=use_tiff_memmap,
+                chunks=chunks,
+                scale_factors=scale_factors,
+            )
+        finally:
+            logger.removeHandler(log_capture)
+            logger.propagate = original_propagate
+
+        # Check warning behavior: when use_tiff_memmap=True, no compression warning should be raised
+        log_messages = [record.getMessage() for record in log_capture.buffer]
+        has_memmap_warning = any("image data is not memory-mappable" in msg for msg in log_messages)
+        if use_tiff_memmap:
+            assert not has_memmap_warning, "Uncompressed TIFF should not trigger memory-mappable warning"
+
         sdata = SpatialData.init_from_elements({"image": im})
-        # sanity check
+        # sanity check: chunks is (c, y, x)
        if scale_factors is None:
             assert isinstance(sdata["image"], DataArray)
             if chunks is not None:
                 assert (
-                    sdata["image"].chunksizes["x"][0] == chunks[0]
+                    sdata["image"].chunksizes["x"][0] == chunks[2]
                     or sdata["image"].chunksizes["x"][0] == sdata["image"].shape[2]
                 )
                 assert (
@@ -86,7 +134,7 @@ def _convert_image(
             assert len(sdata["image"].keys()) == len(scale_factors) + 1
             if chunks is not None:
                 assert (
-                    sdata["image"]["scale0"]["image"].chunksizes["x"][0] == chunks[0]
+                    sdata["image"]["scale0"]["image"].chunksizes["x"][0] == chunks[2]
                     or sdata["image"]["scale0"]["image"].chunksizes["x"][0]
                     == sdata["image"]["scale0"]["image"].shape[2]
                 )
@@ -98,41 +146,54 @@ def _convert_image(
 
         return sdata
 
-    def time_io(self, scale_factors: list[int] | None, use_tiff_memmap: bool, chunks: tuple[int, int]) -> None:
+    def time_io(self, scale_factors: list[int] | None, use_tiff_memmap: bool, chunks: tuple[int, ...]) -> None:
         """Walltime for data parsing."""
         sdata = self._convert_image(scale_factors, use_tiff_memmap, chunks)
         sdata.write(self.path_write)
 
-    def peakmem_io(self, scale_factors: list[int] | None, use_tiff_memmap: bool, chunks: tuple[int, int]) -> None:
+    def peakmem_io(self, scale_factors: list[int] | None, use_tiff_memmap: bool, chunks: tuple[int, ...]) -> None:
         """Peak memory for data parsing."""
         sdata = self._convert_image(scale_factors, use_tiff_memmap, chunks)
         sdata.write(self.path_write)
 
 
-if __name__ == "__main__":
-    # Run a single test case for quick verification
-    bench = IOBenchmarkImage()
-
-    # bench.setup()
-    # bench.time_io(None, True, (5000, 5000))
-
-    # bench.setup()
-    # bench.time_io(None, True, (1000, 1000))
-
-    # bench.setup()
-    # bench.time_io(None, False, (5000, 5000))
-
-    # bench.setup()
-    # bench.time_io(None, False, (1000, 1000))
-
-    # bench.setup()
-    # bench.time_io([2, 2, 2], True, (5000, 5000))
-
-    # bench.setup()
-    # bench.time_io([2, 2, 2], True, (1000, 1000))
-
-    bench.setup()
-    bench.time_io([2, 2, 2], False, (5000, 5000))
-
-    # bench.setup()
-    # bench.time_io([2, 2, 2], False, (1000, 1000))
+# if __name__ == "__main__":
+#     # Run a single test case for quick verification
+#     bench = IOBenchmarkImage()
+#
+#     bench.setup()
+#     bench.time_io(None, True, (1, 5000, 5000))
+#     bench.teardown()
+#
+#     bench.setup()
+#     bench.time_io(None, True, (1, 1000, 1000))
+#     bench.teardown()
+#
+#     bench.setup()
+#     bench.time_io(None, False, (1, 5000, 5000))
+#     bench.teardown()
+#
+#     bench.setup()
+#     bench.time_io(None, False, (1, 1000, 1000))
+#     bench.teardown()
+#
+#     bench.setup()
+#     bench.time_io([2, 2, 2], True, (1, 5000, 5000))
+#     bench.teardown()
+#
+#     bench.setup()
+#     bench.time_io([2, 2, 2], True, (1, 1000, 1000))
+#     bench.teardown()
+#
+#     bench.setup()
+#     bench.time_io([2, 2, 2], False, (1, 5000, 5000))
+#     bench.teardown()
+#
+#     bench.setup()
+#     bench.time_io([2, 2, 2], False, (1, 1000, 1000))
+#     bench.teardown()
+#
+#     # Clean up the shared images temp directory at the end
+#     if IOBenchmarkImage._images_temp_dir is not None:
+#         IOBenchmarkImage._images_temp_dir.cleanup()
+#         IOBenchmarkImage._images_temp_dir = None
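A note on the log-capture pattern that the patch above introduces in _convert_image: logging.handlers.MemoryHandler buffers records in its .buffer attribute, and flush() does nothing when no target handler is configured, so the buffered records remain available for inspection after the read. The following minimal sketch isolates that pattern; the run_reader callable and the "sketch" logger name are placeholders for illustration, not part of the patch:

import logging
import logging.handlers

logger = logging.getLogger("sketch")  # placeholder for spatialdata's logger


def capture_warning_messages(run_reader):
    """Invoke run_reader while buffering WARNING records; return their messages."""
    handler = logging.handlers.MemoryHandler(capacity=100)
    handler.setLevel(logging.WARNING)
    logger.addHandler(handler)
    try:
        run_reader()
    finally:
        logger.removeHandler(handler)
    # Without a flush target, MemoryHandler retains every buffered record
    return [record.getMessage() for record in handler.buffer]


messages = capture_warning_messages(lambda: logger.warning("image data is not memory-mappable"))
assert any("memory-mappable" in m for m in messages)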
From 4b4a00e716d257387c140e48c8424d8fe8fcc433 Mon Sep 17 00:00:00 2001
From: Luca Marconato
Date: Fri, 16 Jan 2026 19:03:59 +0100
Subject: [PATCH 38/39] fix pre-commit

---
 benchmarks/benchmark_image.py | 65 +++++++++++++----------------------
 1 file changed, 23 insertions(+), 42 deletions(-)

diff --git a/benchmarks/benchmark_image.py b/benchmarks/benchmark_image.py
index a872b5c2..ce46dc94 100644
--- a/benchmarks/benchmark_image.py
+++ b/benchmarks/benchmark_image.py
@@ -35,13 +35,14 @@ class IOBenchmarkImage:
     warmup_time = 0
     processes = 1
 
-    # Parameter combinations: scale_factors, use_tiff_memmap, chunks
+    # Parameter combinations: scale_factors, (use_tiff_memmap, compressed), chunks
+    # Combinations: (memmap=False, compressed=True), (memmap=False, compressed=False), (memmap=True, compressed=False)
     params = [
         [None, [2, 2]],  # scale_factors
-        [True, False],  # use_tiff_memmap
+        [(False, True), (False, False), (True, False)],  # (use_tiff_memmap, compressed)
         [(1, 5000, 5000), (1, 1000, 1000)],  # chunks
     ]
-    param_names = ["scale_factors", "use_tiff_memmap", "chunks"]
+    param_names = ["scale_factors", "memmap_compressed", "chunks"]
 
     # Class-level temp directory for image files (persists across all benchmarks)
     _images_temp_dir: tempfile.TemporaryDirectory[str] | None = None
@@ -85,11 +86,13 @@ def teardown(self, *_: Any) -> None:
             self._output_temp_dir.cleanup()
 
     def _convert_image(
-        self, scale_factors: list[int] | None, use_tiff_memmap: bool, chunks: tuple[int, ...]
+        self, scale_factors: list[int] | None, memmap_compressed: tuple[bool, bool], chunks: tuple[int, ...]
     ) -> SpatialData:
         """Read image data with specified parameters."""
-        # Use uncompressed (memmappable) for use_tiff_memmap=True, compressed otherwise
-        path_read = self.path_read_uncompressed if use_tiff_memmap else self.path_read_compressed
+        use_tiff_memmap, compressed = memmap_compressed
+        # Select file based on compression setting
+        path_read = self.path_read_compressed if compressed else self.path_read_uncompressed
+        assert path_read is not None
 
         # Capture log messages to verify memmappable warning behavior
         log_capture = logging.handlers.MemoryHandler(capacity=100)
@@ -111,11 +114,13 @@ def _convert_image(
             logger.removeHandler(log_capture)
             logger.propagate = original_propagate
 
-        # Check warning behavior: when use_tiff_memmap=True, no compression warning should be raised
+        # Check warning behavior: when use_tiff_memmap=True with an uncompressed file, no warning should be raised
         log_messages = [record.getMessage() for record in log_capture.buffer]
         has_memmap_warning = any("image data is not memory-mappable" in msg for msg in log_messages)
-        if use_tiff_memmap:
-            assert not has_memmap_warning, "Uncompressed TIFF should not trigger memory-mappable warning"
+        if use_tiff_memmap and not compressed:
+            assert not has_memmap_warning, (
+                "Uncompressed TIFF with memmap=True should not trigger memory-mappable warning"
+            )
 
         sdata = SpatialData.init_from_elements({"image": im})
         # sanity check: chunks is (c, y, x)
@@ -146,14 +151,18 @@ def _convert_image(
 
         return sdata
 
-    def time_io(self, scale_factors: list[int] | None, use_tiff_memmap: bool, chunks: tuple[int, ...]) -> None:
+    def time_io(
+        self, scale_factors: list[int] | None, memmap_compressed: tuple[bool, bool], chunks: tuple[int, ...]
+    ) -> None:
         """Walltime for data parsing."""
-        sdata = self._convert_image(scale_factors, use_tiff_memmap, chunks)
+        sdata = self._convert_image(scale_factors, memmap_compressed, chunks)
         sdata.write(self.path_write)
 
-    def peakmem_io(self, scale_factors: list[int] | None, use_tiff_memmap: bool, chunks: tuple[int, ...]) -> None:
+    def peakmem_io(
+        self, scale_factors: list[int] | None, memmap_compressed: tuple[bool, bool], chunks: tuple[int, ...]
+    ) -> None:
         """Peak memory for data parsing."""
-        sdata = self._convert_image(scale_factors, use_tiff_memmap, chunks)
+        sdata = self._convert_image(scale_factors, memmap_compressed, chunks)
         sdata.write(self.path_write)
 
 
@@ -162,35 +171,7 @@ def peakmem_io(self, scale_factors: list[int] | None, use_tiff_memmap: bool, chu
 # bench = IOBenchmarkImage()
 #
 # bench.setup()
-# bench.time_io(None, True, (1, 5000, 5000))
-# bench.teardown()
-#
-# bench.setup()
-# bench.time_io(None, True, (1, 1000, 1000))
-# bench.teardown()
-#
-# bench.setup()
-# bench.time_io(None, False, (1, 5000, 5000))
-# bench.teardown()
-#
-# bench.setup()
-# bench.time_io(None, False, (1, 1000, 1000))
-# bench.teardown()
-#
-# bench.setup()
-# bench.time_io([2, 2, 2], True, (1, 5000, 5000))
-# bench.teardown()
-#
-# bench.setup()
-# bench.time_io([2, 2, 2], True, (1, 1000, 1000))
-# bench.teardown()
-#
-# bench.setup()
-# bench.time_io([2, 2, 2], False, (1, 5000, 5000))
-# bench.teardown()
-#
-# bench.setup()
-# bench.time_io([2, 2, 2], False, (1, 1000, 1000))
+# bench.time_io(None, (True, False), (1, 5000, 5000))
 # bench.teardown()
 #
 # # Clean up the shared images temp directory at the end
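The parameter grid in this patch deliberately omits the (use_tiff_memmap=True, compressed=True) pair: tifffile can only memory-map image data that is stored uncompressed and contiguously, and tifffile.memmap raises ValueError otherwise. A minimal sketch of that constraint, assuming only that numpy and tifffile are installed (the file names are throwaway):

import tempfile
from pathlib import Path

import numpy as np
import tifffile

tmp = Path(tempfile.mkdtemp())
data = np.zeros((3, 64, 64), dtype=np.uint8)

# Uncompressed data is written contiguously and can be memory-mapped
tifffile.imwrite(tmp / "plain.tif", data, compression=None)
# Compressed data cannot be memory-mapped
tifffile.imwrite(tmp / "packed.tif", data, compression="zlib")

mm = tifffile.memmap(tmp / "plain.tif")  # zero-copy NumPy view backed by the file
assert mm.shape == (3, 64, 64)

try:
    tifffile.memmap(tmp / "packed.tif")
except ValueError:
    print("compressed TIFF data is not memory-mappable")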
From 355b695788d0edc3ad46e087da50308345710f9e Mon Sep 17 00:00:00 2001
From: Luca Marconato
Date: Sun, 18 Jan 2026 16:22:43 +0100
Subject: [PATCH 39/39] c=1 vs c=3 benchmark

---
 benchmarks/benchmark_image.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/benchmark_image.py b/benchmarks/benchmark_image.py
index ce46dc94..3096fee7 100644
--- a/benchmarks/benchmark_image.py
+++ b/benchmarks/benchmark_image.py
@@ -40,7 +40,7 @@ class IOBenchmarkImage:
     params = [
         [None, [2, 2]],  # scale_factors
         [(False, True), (False, False), (True, False)],  # (use_tiff_memmap, compressed)
-        [(1, 5000, 5000), (1, 1000, 1000)],  # chunks
+        [(1, 250, 250), (3, 250, 250)],  # chunks
     ]
     param_names = ["scale_factors", "memmap_compressed", "chunks"]
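The final chunk parameters compare per-channel chunking (c=1) against whole-channel chunks (c=3) at the same spatial tile size. Assuming the reader forwards the chunks tuple to dask unchanged, in the (c, y, x) order that the sanity checks in _convert_image imply, the difference is purely in how many blocks dask has to schedule:

import dask.array as da
import numpy as np

arr = np.zeros((3, 1000, 1000), dtype=np.uint8)

# c=1: every channel is a separate block -> 3 * 4 * 4 = 48 blocks
print(da.from_array(arr, chunks=(1, 250, 250)).numblocks)  # (3, 4, 4)

# c=3: channels share each spatial block -> 1 * 4 * 4 = 16 blocks
print(da.from_array(arr, chunks=(3, 250, 250)).numblocks)  # (1, 4, 4)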