Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ dependencies = [
"click",
"dask-image",
"dask>=2025.2.0,<2026.1.2",
"distributed<2026.1.2",
"datashader",
"fsspec[s3,http]",
"geopandas>=0.14",
Expand Down
44 changes: 29 additions & 15 deletions src/spatialdata/models/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,8 @@ def parse(
else:
# Chunk single scale images
if chunks is not None:
if isinstance(chunks, tuple):
chunks = {dim: chunks[index] for index, dim in enumerate(data.dims)}
data = data.chunk(chunks=chunks)
cls()._check_chunk_size_not_too_large(data)
# recompute coordinates for (multiscale) spatial image
Expand Down Expand Up @@ -819,19 +821,23 @@ def _(
# TODO: dask does not allow for setting divisions directly anymore. We have to decide on forcing the user.
if feature_key is not None:
feature_categ = dd.from_pandas(
data[feature_key].astype(str).astype("category"),
data[feature_key],
sort=sort,
**kwargs,
)
table[feature_key] = feature_categ
elif isinstance(data, dd.DataFrame):
table = data[[coordinates[ax] for ax in axes]]
table.columns = axes
if feature_key is not None:
if data[feature_key].dtype.name == "category":
table[feature_key] = data[feature_key]
else:
table[feature_key] = data[feature_key].astype(str).astype("category")

if feature_key is not None:
if data[feature_key].dtype.name == "category":
table[feature_key] = data[feature_key]
else:
# this will cause the categories to be unknown and trigger the warning (and performance slowdown) in
# _add_metadata_and_validate()
table[feature_key] = data[feature_key].astype(str).astype("category")

if instance_key is not None:
table[instance_key] = data[instance_key]
for c in [X, Y, Z]:
Expand Down Expand Up @@ -885,15 +891,20 @@ def _add_metadata_and_validate(
assert instance_key in data.columns
data.attrs[ATTRS_KEY][cls.INSTANCE_KEY] = instance_key

for c in data.columns:
# Here we are explicitly importing the categories
# but it is a convenient way to ensure that the categories are known.
# It also just changes the state of the series, so it is not a big deal.
if isinstance(data[c].dtype, CategoricalDtype) and not data[c].cat.known:
try:
data[c] = data[c].cat.set_categories(data[c].compute().cat.categories)
except ValueError:
logger.info(f"Column `{c}` contains unknown categories. Consider casting it.")
if (
feature_key is not None
and isinstance(data[feature_key].dtype, CategoricalDtype)
and not data[feature_key].cat.known
):
logger.warning(
f"The `feature_key` column {feature_key} is categorical with unknown categories. "
"Please ensure the categories are known before calling `PointsModel.parse()` to "
"avoid significant performance implications due to the need for dask to compute "
"the categories. If you did not use PointsModel.parse() explicitly in your code ("
"e.g. this message is coming from a reader in `spatialdata_io`), please report "
"this finding."
)
data[feature_key] = data[feature_key].cat.set_categories(data[feature_key].compute().cat.categories)

_parse_transformations(data, transformations)
cls.validate(data)
Expand Down Expand Up @@ -1153,6 +1164,9 @@ def parse(
The parsed data.
"""
validate_table_attr_keys(adata)
# Convert view to actual copy to avoid ImplicitModificationWarning when modifying .uns
if adata.is_view:
adata = adata.copy()
# either all live in adata.uns or all be passed in as argument
n_args = sum([region is not None, region_key is not None, instance_key is not None])
if n_args == 0:
Expand Down
Loading