From b6567ea7202ae738c8396bc17b3d9894f2a90b1b Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Tue, 7 Apr 2026 12:52:13 +0200 Subject: [PATCH 01/29] Add test_from_xarray_dataset_dict --- tests/datasets/test_utils.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 tests/datasets/test_utils.py diff --git a/tests/datasets/test_utils.py b/tests/datasets/test_utils.py new file mode 100644 index 000000000..21ae6a2ce --- /dev/null +++ b/tests/datasets/test_utils.py @@ -0,0 +1,14 @@ +from parcels._datasets.structured.generic import datasets +from parcels._datasets.utils import from_xarray_dataset_dict + + +def test_from_xarray_dataset_dict(): + ds_expected = datasets["ds_2d_left"] + d = ds_expected.to_dict(data=False) + ds = from_xarray_dataset_dict(d) + + assert list(ds.coords) == list(ds_expected.coords) + assert list(ds.data_vars) == list(ds_expected.data_vars) + + for k in set(ds.coords) | set(ds.data_vars): + assert ds[k].attrs == ds_expected[k].attrs, f"Attrs for {k!r} do not match" From 2d66397b94e8dde9ec4c08eecea7251666254f4d Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Tue, 7 Apr 2026 13:31:20 +0200 Subject: [PATCH 02/29] Add tooling for serializing and deserializing from JSON --- src/parcels/_datasets/utils.py | 69 +++++++++++++++++++++++++++++++++- tests/datasets/test_utils.py | 15 +++++++- 2 files changed, 81 insertions(+), 3 deletions(-) diff --git a/src/parcels/_datasets/utils.py b/src/parcels/_datasets/utils.py index f8af6b7e0..faea3bba5 100644 --- a/src/parcels/_datasets/utils.py +++ b/src/parcels/_datasets/utils.py @@ -1,10 +1,15 @@ import copy -from typing import Any +from collections.abc import Mapping +from typing import Any, TypeVar, cast import numpy as np import xarray as xr +from parcels._typing import PathLike + _SUPPORTED_ATTR_TYPES = int | float | str | np.ndarray +K = TypeVar("K") +V = TypeVar("V") def _print_mismatched_keys(d1: 
dict[Any, Any], d2: dict[Any, Any]) -> None: @@ -203,6 +208,68 @@ def from_xarray_dataset_dict(d) -> xr.Dataset: return xr.Dataset.from_dict(_fill_with_dummy_data(copy.deepcopy(d))) +def dataset_from_json(path: PathLike) -> xr.Dataset: + import json + + with open(path, "rb") as f: + d = json.load(f) + assert d["version"] == "1", f"Version of TOML CDL representation must be '1'. Got {d['version']!r}" + + ds_dict = _fill_with_dummy_data(d["dataset"]) + + return xr.Dataset.from_dict(ds_dict) + + +def dataset_to_json(ds: xr.Dataset, path: PathLike) -> None: + import json + + with open(path, "w") as f: + d = { + "version": "1", + "dataset": _dataset_to_dict_with_coordinate_arrays(ds), + } + json.dump(d, f) + return + + +def _decode_numpy_dict_values(attrs: Mapping[K, V]) -> dict[K, V]: + """Convert attribute values from numpy objects to native Python objects, + for use in to_dict + """ + attrs = dict(attrs) + for k, v in attrs.items(): + if isinstance(v, np.ndarray): + attrs[k] = cast(V, v.tolist()) + elif isinstance(v, np.generic): + attrs[k] = v.item() + return attrs + + +def _dataset_to_dict_with_coordinate_arrays(ds: xr.Dataset) -> dict: + # Implementation mostly copied from xr.Dataset.to_dict() + encoding = True + + d: dict = { + "coords": {}, + "attrs": _decode_numpy_dict_values(ds.attrs), + "dims": dict(ds.sizes), + "data_vars": {}, + } + for k in ds.coords: + d["coords"].update( + { + k: ds[k].variable.to_dict(data="list", encoding=encoding) + } # data='list' so coordinates are written to file + ) + for k in ds.data_vars: + d["data_vars"].update( + {k: ds[k].variable.to_dict(data=False, encoding=encoding)} # data=False so that data isn't writen to file + ) + if encoding: + d["encoding"] = dict(ds.encoding) + return d + + def _fill_with_dummy_data(d: dict[str, dict]): assert isinstance(d, dict) if "dtype" in d: diff --git a/tests/datasets/test_utils.py b/tests/datasets/test_utils.py index 21ae6a2ce..32816646b 100644 --- a/tests/datasets/test_utils.py +++ 
b/tests/datasets/test_utils.py @@ -1,14 +1,25 @@ +import pytest +import xarray as xr + +from parcels._datasets import utils from parcels._datasets.structured.generic import datasets -from parcels._datasets.utils import from_xarray_dataset_dict def test_from_xarray_dataset_dict(): ds_expected = datasets["ds_2d_left"] d = ds_expected.to_dict(data=False) - ds = from_xarray_dataset_dict(d) + ds = utils.from_xarray_dataset_dict(d) assert list(ds.coords) == list(ds_expected.coords) assert list(ds.data_vars) == list(ds_expected.data_vars) for k in set(ds.coords) | set(ds.data_vars): assert ds[k].attrs == ds_expected[k].attrs, f"Attrs for {k!r} do not match" + + +@pytest.mark.parametrize("ds", [pytest.param(v, id=k) for k, v in datasets.items()]) +def test_dataset_json_roundtrip(ds: xr.Dataset, tmp_path): + path = tmp_path / "dataset-metadata.json" + utils.dataset_to_json(ds, path) + ds_parsed = utils.dataset_from_json(path) # noqa: F841 + # breakpoint() From 3e55fbf0ad0b19211a64ffb674d9e5f4c22997a5 Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Tue, 7 Apr 2026 13:51:30 +0200 Subject: [PATCH 03/29] Account for datetimes by serializing to iso strings --- src/parcels/_datasets/utils.py | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/src/parcels/_datasets/utils.py b/src/parcels/_datasets/utils.py index faea3bba5..d255f0861 100644 --- a/src/parcels/_datasets/utils.py +++ b/src/parcels/_datasets/utils.py @@ -1,5 +1,7 @@ import copy +import re from collections.abc import Mapping +from datetime import datetime from typing import Any, TypeVar, cast import numpy as np @@ -7,6 +9,8 @@ from parcels._typing import PathLike +_ISO_DATETIME_RE = re.compile(r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}") + _SUPPORTED_ATTR_TYPES = int | float | str | np.ndarray K = TypeVar("K") V = TypeVar("V") @@ -208,6 +212,28 @@ def from_xarray_dataset_dict(d) -> xr.Dataset: return 
xr.Dataset.from_dict(_fill_with_dummy_data(copy.deepcopy(d))) +def _datetimes_to_iso(obj: Any) -> Any: + """Recursively convert datetime objects to ISO format strings.""" + if isinstance(obj, datetime): + return obj.isoformat() + if isinstance(obj, list): + return [_datetimes_to_iso(v) for v in obj] + if isinstance(obj, dict): + return {k: _datetimes_to_iso(v) for k, v in obj.items()} + return obj + + +def _iso_to_datetimes(obj: Any) -> Any: + """Recursively convert ISO datetime strings back to datetime objects.""" + if isinstance(obj, str) and _ISO_DATETIME_RE.match(obj): + return datetime.fromisoformat(obj) + if isinstance(obj, list): + return [_iso_to_datetimes(v) for v in obj] + if isinstance(obj, dict): + return {k: _iso_to_datetimes(v) for k, v in obj.items()} + return obj + + def dataset_from_json(path: PathLike) -> xr.Dataset: import json @@ -215,7 +241,7 @@ def dataset_from_json(path: PathLike) -> xr.Dataset: d = json.load(f) assert d["version"] == "1", f"Version of TOML CDL representation must be '1'. 
Got {d['version']!r}" - ds_dict = _fill_with_dummy_data(d["dataset"]) + ds_dict = _fill_with_dummy_data(_iso_to_datetimes(d["dataset"])) return xr.Dataset.from_dict(ds_dict) @@ -226,7 +252,7 @@ def dataset_to_json(ds: xr.Dataset, path: PathLike) -> None: with open(path, "w") as f: d = { "version": "1", - "dataset": _dataset_to_dict_with_coordinate_arrays(ds), + "dataset": _datetimes_to_iso(_dataset_to_dict_with_coordinate_arrays(ds)), } json.dump(d, f) return From 3f1979fbf25568ad5b6ebba16055e8ac3585de22 Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Tue, 7 Apr 2026 13:53:39 +0200 Subject: [PATCH 04/29] Refactor to json.JSON*coder classes --- src/parcels/_datasets/utils.py | 62 ++++++++++++++++++---------------- tests/datasets/test_utils.py | 11 ++++++ 2 files changed, 43 insertions(+), 30 deletions(-) diff --git a/src/parcels/_datasets/utils.py b/src/parcels/_datasets/utils.py index d255f0861..7fd965ebe 100644 --- a/src/parcels/_datasets/utils.py +++ b/src/parcels/_datasets/utils.py @@ -1,5 +1,5 @@ import copy -import re +import json from collections.abc import Mapping from datetime import datetime from typing import Any, TypeVar, cast @@ -9,8 +9,6 @@ from parcels._typing import PathLike -_ISO_DATETIME_RE = re.compile(r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}") - _SUPPORTED_ATTR_TYPES = int | float | str | np.ndarray K = TypeVar("K") V = TypeVar("V") @@ -212,49 +210,53 @@ def from_xarray_dataset_dict(d) -> xr.Dataset: return xr.Dataset.from_dict(_fill_with_dummy_data(copy.deepcopy(d))) -def _datetimes_to_iso(obj: Any) -> Any: - """Recursively convert datetime objects to ISO format strings.""" - if isinstance(obj, datetime): - return obj.isoformat() - if isinstance(obj, list): - return [_datetimes_to_iso(v) for v in obj] - if isinstance(obj, dict): - return {k: _datetimes_to_iso(v) for k, v in obj.items()} - return obj +class _DatetimeEncoder(json.JSONEncoder): + """Convert all datetime objects within to be isoformat 
strings.""" + def default(self, o: Any) -> Any: + if isinstance(o, datetime): + return o.isoformat() + return super().default(o) -def _iso_to_datetimes(obj: Any) -> Any: - """Recursively convert ISO datetime strings back to datetime objects.""" - if isinstance(obj, str) and _ISO_DATETIME_RE.match(obj): - return datetime.fromisoformat(obj) - if isinstance(obj, list): - return [_iso_to_datetimes(v) for v in obj] - if isinstance(obj, dict): - return {k: _iso_to_datetimes(v) for k, v in obj.items()} - return obj +class _DatetimeDecoder(json.JSONDecoder): + """Convert all isoformat datetime strings within to be datetime objects.""" -def dataset_from_json(path: PathLike) -> xr.Dataset: - import json + def raw_decode(self, s: str, idx: int = 0) -> tuple[Any, int]: + obj, end = super().raw_decode(s, idx) + return self._convert(obj), end + + @staticmethod + def _convert(obj: Any) -> Any: + if isinstance(obj, str): + try: + return datetime.fromisoformat(obj) + except ValueError: + return obj + if isinstance(obj, list): + return [_DatetimeDecoder._convert(v) for v in obj] + if isinstance(obj, dict): + return {k: _DatetimeDecoder._convert(v) for k, v in obj.items()} + return obj - with open(path, "rb") as f: - d = json.load(f) + +def dataset_from_json(path: PathLike) -> xr.Dataset: + with open(path) as f: + d = json.load(f, cls=_DatetimeDecoder) assert d["version"] == "1", f"Version of TOML CDL representation must be '1'. 
Got {d['version']!r}" - ds_dict = _fill_with_dummy_data(_iso_to_datetimes(d["dataset"])) + ds_dict = _fill_with_dummy_data(d["dataset"]) return xr.Dataset.from_dict(ds_dict) def dataset_to_json(ds: xr.Dataset, path: PathLike) -> None: - import json - with open(path, "w") as f: d = { "version": "1", - "dataset": _datetimes_to_iso(_dataset_to_dict_with_coordinate_arrays(ds)), + "dataset": _dataset_to_dict_with_coordinate_arrays(ds), } - json.dump(d, f) + json.dump(d, f, cls=_DatetimeEncoder) return diff --git a/tests/datasets/test_utils.py b/tests/datasets/test_utils.py index 32816646b..9f0655f8d 100644 --- a/tests/datasets/test_utils.py +++ b/tests/datasets/test_utils.py @@ -1,3 +1,6 @@ +import json +from datetime import datetime + import pytest import xarray as xr @@ -5,6 +8,14 @@ from parcels._datasets.structured.generic import datasets +def test_datetime_encoder_decoder_roundtrip(): + dt = datetime(2000, 1, 15, 12, 30, 45) + data = {"timestamps": [dt, dt], "nested": {"time": dt}, "value": 42} + encoded = json.dumps(data, cls=utils._DatetimeEncoder) + decoded = json.loads(encoded, cls=utils._DatetimeDecoder) + assert decoded == data + + def test_from_xarray_dataset_dict(): ds_expected = datasets["ds_2d_left"] d = ds_expected.to_dict(data=False) From d247a70a6500df75a0e7ac74cb55b7208ef10957 Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Tue, 7 Apr 2026 14:15:46 +0200 Subject: [PATCH 05/29] Add note on cftime compat with error messaging --- src/parcels/_datasets/utils.py | 15 +++++++++++++-- tests/datasets/test_utils.py | 20 ++++++++++++++++++++ 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/src/parcels/_datasets/utils.py b/src/parcels/_datasets/utils.py index 7fd965ebe..d1cee9b15 100644 --- a/src/parcels/_datasets/utils.py +++ b/src/parcels/_datasets/utils.py @@ -241,9 +241,10 @@ def _convert(obj: Any) -> Any: def dataset_from_json(path: PathLike) -> xr.Dataset: + """Read in a representative Xarray 
dataset from a JSON filed created by `dataset_to_json`.""" with open(path) as f: d = json.load(f, cls=_DatetimeDecoder) - assert d["version"] == "1", f"Version of TOML CDL representation must be '1'. Got {d['version']!r}" + assert d["version"] == "1", f"Version of Parcels JSON CDL representation must be '1'. Got {d['version']!r}" ds_dict = _fill_with_dummy_data(d["dataset"]) @@ -251,12 +252,22 @@ def dataset_from_json(path: PathLike) -> xr.Dataset: def dataset_to_json(ds: xr.Dataset, path: PathLike) -> None: + """Serialize a dataset to JSON with coordinate arrays. + + Does not support CFtime time coordinate. + """ with open(path, "w") as f: d = { "version": "1", "dataset": _dataset_to_dict_with_coordinate_arrays(ds), } - json.dump(d, f, cls=_DatetimeEncoder) + try: + json.dump(d, f, cls=_DatetimeEncoder) + except TypeError as e: + e.add_note( + "This function does not support CFtime time coordinates. Replace with datetime or float coordinates using (e.g., `ds['time'] = ...`)." + ) + raise e return diff --git a/tests/datasets/test_utils.py b/tests/datasets/test_utils.py index 9f0655f8d..42d692941 100644 --- a/tests/datasets/test_utils.py +++ b/tests/datasets/test_utils.py @@ -28,9 +28,29 @@ def test_from_xarray_dataset_dict(): assert ds[k].attrs == ds_expected[k].attrs, f"Attrs for {k!r} do not match" +def _replace_with_cf_time(ds) -> xr.Dataset: + import cftime + + assert "time" in ds, "Dataset must have a dimension named 'time'" + ntime = 12 + ntime = min(ntime, len(ds.time.values)) + ds = ds.isel(time=slice(None, ntime)) + + dates = [cftime.DatetimeNoLeap(1, month, 1) for month in range(1, ntime + 1)] + ds["time"] = dates + return ds + + @pytest.mark.parametrize("ds", [pytest.param(v, id=k) for k, v in datasets.items()]) def test_dataset_json_roundtrip(ds: xr.Dataset, tmp_path): path = tmp_path / "dataset-metadata.json" utils.dataset_to_json(ds, path) ds_parsed = utils.dataset_from_json(path) # noqa: F841 # breakpoint() + + +@pytest.mark.parametrize("ds", 
[pytest.param(_replace_with_cf_time(datasets["ds_2d_left"]), id="cftime-ds_2d_left")]) +def test_dataset_json_errors_with_cftime(ds: xr.Dataset, tmp_path): + path = tmp_path / "dataset-metadata.json" + with pytest.raises(TypeError, match="Object of type Datetime.* is not JSON serializable"): + utils.dataset_to_json(ds, path) From ee0fa10555bba271f168129a0cc9850dd1810a30 Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Tue, 7 Apr 2026 14:34:56 +0200 Subject: [PATCH 06/29] Update test --- tests/datasets/test_utils.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/tests/datasets/test_utils.py b/tests/datasets/test_utils.py index 42d692941..90bf91932 100644 --- a/tests/datasets/test_utils.py +++ b/tests/datasets/test_utils.py @@ -1,6 +1,7 @@ import json from datetime import datetime +import numpy as np import pytest import xarray as xr @@ -45,8 +46,20 @@ def _replace_with_cf_time(ds) -> xr.Dataset: def test_dataset_json_roundtrip(ds: xr.Dataset, tmp_path): path = tmp_path / "dataset-metadata.json" utils.dataset_to_json(ds, path) - ds_parsed = utils.dataset_from_json(path) # noqa: F841 - # breakpoint() + ds_parsed = utils.dataset_from_json(path) + + assert list(ds_parsed.coords) == list(ds.coords) + assert list(ds_parsed.data_vars) == list(ds.data_vars) + + for k in set(ds.data_vars): + assert ds_parsed[k].attrs == ds[k].attrs, f"Attrs for {k!r} do not match" + + for k in set(ds.coords): + assert ds_parsed[k].attrs == ds[k].attrs, f"Attrs for {k!r} do not match" + if isinstance(ds_parsed[k].dtype, np.dtypes.DateTime64DType): + np.testing.assert_equal(ds_parsed[k].values, ds[k].values) + else: + np.testing.assert_allclose(ds_parsed[k].values, ds[k].values) @pytest.mark.parametrize("ds", [pytest.param(_replace_with_cf_time(datasets["ds_2d_left"]), id="cftime-ds_2d_left")]) From f0b7b519db3b45b5fac83322329491f604939aeb Mon Sep 17 00:00:00 2001 From: Vecko 
<36369090+VeckoTheGecko@users.noreply.github.com> Date: Tue, 7 Apr 2026 14:52:23 +0200 Subject: [PATCH 07/29] Update names --- src/parcels/_datasets/utils.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/parcels/_datasets/utils.py b/src/parcels/_datasets/utils.py index d1cee9b15..791ac84cb 100644 --- a/src/parcels/_datasets/utils.py +++ b/src/parcels/_datasets/utils.py @@ -210,7 +210,7 @@ def from_xarray_dataset_dict(d) -> xr.Dataset: return xr.Dataset.from_dict(_fill_with_dummy_data(copy.deepcopy(d))) -class _DatetimeEncoder(json.JSONEncoder): +class _XarrayEncoder(json.JSONEncoder): """Convert all datetime objects within to be isoformat strings.""" def default(self, o: Any) -> Any: @@ -219,7 +219,7 @@ def default(self, o: Any) -> Any: return super().default(o) -class _DatetimeDecoder(json.JSONDecoder): +class _XarrayDecoder(json.JSONDecoder): """Convert all isoformat datetime strings within to be datetime objects.""" def raw_decode(self, s: str, idx: int = 0) -> tuple[Any, int]: @@ -234,16 +234,16 @@ def _convert(obj: Any) -> Any: except ValueError: return obj if isinstance(obj, list): - return [_DatetimeDecoder._convert(v) for v in obj] + return [_XarrayDecoder._convert(v) for v in obj] if isinstance(obj, dict): - return {k: _DatetimeDecoder._convert(v) for k, v in obj.items()} + return {k: _XarrayDecoder._convert(v) for k, v in obj.items()} return obj def dataset_from_json(path: PathLike) -> xr.Dataset: """Read in a representative Xarray dataset from a JSON filed created by `dataset_to_json`.""" with open(path) as f: - d = json.load(f, cls=_DatetimeDecoder) + d = json.load(f, cls=_XarrayDecoder) assert d["version"] == "1", f"Version of Parcels JSON CDL representation must be '1'. 
Got {d['version']!r}" ds_dict = _fill_with_dummy_data(d["dataset"]) @@ -262,7 +262,7 @@ def dataset_to_json(ds: xr.Dataset, path: PathLike) -> None: "dataset": _dataset_to_dict_with_coordinate_arrays(ds), } try: - json.dump(d, f, cls=_DatetimeEncoder) + json.dump(d, f, cls=_XarrayEncoder) except TypeError as e: e.add_note( "This function does not support CFtime time coordinates. Replace with datetime or float coordinates using (e.g., `ds['time'] = ...`)." From 18cb1fbb931a27a39ba7d21d07afed90b04cb5cf Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Tue, 7 Apr 2026 15:44:18 +0200 Subject: [PATCH 08/29] Add compression option --- src/parcels/_datasets/utils.py | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/src/parcels/_datasets/utils.py b/src/parcels/_datasets/utils.py index 791ac84cb..052361b35 100644 --- a/src/parcels/_datasets/utils.py +++ b/src/parcels/_datasets/utils.py @@ -2,7 +2,9 @@ import json from collections.abc import Mapping from datetime import datetime -from typing import Any, TypeVar, cast +from functools import partial +from pathlib import Path +from typing import Any, Literal, TypeVar, cast import numpy as np import xarray as xr @@ -251,12 +253,31 @@ def dataset_from_json(path: PathLike) -> xr.Dataset: return xr.Dataset.from_dict(ds_dict) -def dataset_to_json(ds: xr.Dataset, path: PathLike) -> None: +def get_opener(mode: Literal["r", "w"], compressed: bool): + import gzip + + if compressed: + return partial(gzip.open, mode=f"{mode}t", encoding="utf-8") + else: + return partial(open, mode=mode) + + +def dataset_to_json(ds: xr.Dataset, path: PathLike, compressed=False) -> None: """Serialize a dataset to JSON with coordinate arrays. Does not support CFtime time coordinate. """ - with open(path, "w") as f: + path = Path(path) + if compressed: + if path.suffix != ".gz": + raise ValueError(f"Path suffix must be '.gz' . 
Got {path.suffix}") + else: + if path.suffix != ".json": + raise ValueError(f"Path suffix must be '.json' . Got {path.suffix}") + + _open = get_opener(mode="w", compressed=compressed) + + with _open(path) as f: d = { "version": "1", "dataset": _dataset_to_dict_with_coordinate_arrays(ds), @@ -286,7 +307,7 @@ def _decode_numpy_dict_values(attrs: Mapping[K, V]) -> dict[K, V]: def _dataset_to_dict_with_coordinate_arrays(ds: xr.Dataset) -> dict: # Implementation mostly copied from xr.Dataset.to_dict() - encoding = True + encoding = False d: dict = { "coords": {}, From b848ad3d5e3b2ff1d2af016494ce78f2f1ed4b08 Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Wed, 8 Apr 2026 10:53:16 +0200 Subject: [PATCH 09/29] TMP: Add script to compare dumping metadata file sizes --- dump-grid-data.py | 41 +++++++++++++++++++++++++++++++++++++++++ pixi.toml | 1 + 2 files changed, 42 insertions(+) create mode 100644 dump-grid-data.py diff --git a/dump-grid-data.py b/dump-grid-data.py new file mode 100644 index 000000000..b62f90788 --- /dev/null +++ b/dump-grid-data.py @@ -0,0 +1,41 @@ +import datetime +import os +from pathlib import Path + +import intake + +from parcels._datasets import utils +from parcels._datasets.structured.generic import datasets_sgrid + +cat = intake.open_catalog("../parcels-benchmarks/data/surf-data/parcels-benchmarks/catalog.yml") + + +def sizeof_fmt(num, suffix="B"): + for unit in ("", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi"): + if abs(num) < 1024.0: + return f"{num:3.1f}{unit}{suffix}" + num /= 1024.0 + return f"{num:.1f}Yi{suffix}" + + +datasets = { + "example-dataset-from-parcels": datasets_sgrid["ds_2d_padded_low"], + "fesom_mesh": cat.fesom_baroclinic_gyre_mesh.to_dask(), + "moi_mesh": cat.moi_mesh.to_dask().set_coords(["glamf", "glamu"]), +} + +jsons_folder = Path("jsons") +jsons_folder.mkdir(exist_ok=True) +print("Uncompressed JSON representation of datasets") 
+print("============================================") +for k, ds in datasets.items(): + path = Path(f"jsons/{datetime.datetime.now().isoformat()}-{k}.json") + utils.dataset_to_json(ds, path) + print(f"Dataset {k} JSON representation is size: {sizeof_fmt(os.path.getsize(path)):>8}") +print() +print("Compressed JSON representation of datasets") +print("==========================================") +for k, ds in datasets.items(): + path = Path(f"jsons/{datetime.datetime.now().isoformat()}-{k}.json.gz") + utils.dataset_to_json(ds, path, compressed=True) + print(f"Dataset {k} JSON representation is size: {sizeof_fmt(os.path.getsize(path)):>8}") diff --git a/pixi.toml b/pixi.toml index 1e973434b..a54a1f83e 100644 --- a/pixi.toml +++ b/pixi.toml @@ -92,6 +92,7 @@ icecream = "*" ipykernel = "*" snoop = "*" pyinstrument = "*" +intake-xarray = ">=2.0.0,<3" [feature.devtools.target.linux-64.dependencies] memray = "*" From 67e63fb7a7648d786e247b7659c922a15b7c82be Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Wed, 8 Apr 2026 13:49:58 +0200 Subject: [PATCH 10/29] Migrate to zarr compression --- dump-grid-data.py | 51 +++++++++++++++-------- src/parcels/_datasets/utils.py | 74 ++++------------------------------ 2 files changed, 41 insertions(+), 84 deletions(-) diff --git a/dump-grid-data.py b/dump-grid-data.py index b62f90788..ead91e268 100644 --- a/dump-grid-data.py +++ b/dump-grid-data.py @@ -3,9 +3,9 @@ from pathlib import Path import intake +import zarr from parcels._datasets import utils -from parcels._datasets.structured.generic import datasets_sgrid cat = intake.open_catalog("../parcels-benchmarks/data/surf-data/parcels-benchmarks/catalog.yml") @@ -19,23 +19,40 @@ def sizeof_fmt(num, suffix="B"): datasets = { - "example-dataset-from-parcels": datasets_sgrid["ds_2d_padded_low"], - "fesom_mesh": cat.fesom_baroclinic_gyre_mesh.to_dask(), + # "example-dataset-from-parcels": datasets_sgrid["ds_2d_padded_low"], + # "fesom_mesh": 
cat.fesom_baroclinic_gyre_mesh.to_dask(), "moi_mesh": cat.moi_mesh.to_dask().set_coords(["glamf", "glamu"]), } -jsons_folder = Path("jsons") -jsons_folder.mkdir(exist_ok=True) -print("Uncompressed JSON representation of datasets") -print("============================================") +zarrs_folder = Path("zarrs") +zarrs_folder.mkdir(exist_ok=True) for k, ds in datasets.items(): - path = Path(f"jsons/{datetime.datetime.now().isoformat()}-{k}.json") - utils.dataset_to_json(ds, path) - print(f"Dataset {k} JSON representation is size: {sizeof_fmt(os.path.getsize(path)):>8}") -print() -print("Compressed JSON representation of datasets") -print("==========================================") -for k, ds in datasets.items(): - path = Path(f"jsons/{datetime.datetime.now().isoformat()}-{k}.json.gz") - utils.dataset_to_json(ds, path, compressed=True) - print(f"Dataset {k} JSON representation is size: {sizeof_fmt(os.path.getsize(path)):>8}") + path = zarrs_folder / f"{datetime.datetime.now().isoformat()}-{k}.zip" + ds = ds.pipe(utils.strip_datavars) + nbytes_uncompressed_full_dataset = ds.nbytes + nbytes_uncompressed_coords = 0 + + for c in ds.coords: + nbytes_uncompressed_coords += ds.coords[c].nbytes + + ds.to_zarr(zarr.storage.ZipStore(path)) + + nbytes_compressed = os.path.getsize(path) + + print(r"Summary for dataset {k!r}") + print("=========================") + print(f"Original dataset uncompressed size: {sizeof_fmt(nbytes_uncompressed_full_dataset):>8}") + print(f"Coords dataset uncompressed size: {sizeof_fmt(nbytes_uncompressed_coords):>8}") + print(f"Compressed Zarr with coordinates: {sizeof_fmt(nbytes_compressed):>8}") + print("---") + print("Compressed dataset is:") + print(f" -{nbytes_compressed / nbytes_uncompressed_full_dataset:.1%} of original") + print(f" -{nbytes_compressed / nbytes_uncompressed_coords:.1%} of coordinate only") + +# print() +# print("Compressed JSON representation of datasets") +# print("==========================================") +# for k, 
ds in datasets.items(): +# path = Path(f"jsons/{datetime.datetime.now().isoformat()}-{k}.json.gz") +# utils.dataset_to_json(ds, path, compressed=True) +# print(f"Dataset {k} JSON representation is size: {sizeof_fmt(os.path.getsize(path)):>8}") diff --git a/src/parcels/_datasets/utils.py b/src/parcels/_datasets/utils.py index 052361b35..3517cd4fc 100644 --- a/src/parcels/_datasets/utils.py +++ b/src/parcels/_datasets/utils.py @@ -1,10 +1,8 @@ import copy import json -from collections.abc import Mapping from datetime import datetime from functools import partial -from pathlib import Path -from typing import Any, Literal, TypeVar, cast +from typing import Any, Literal, TypeVar import numpy as np import xarray as xr @@ -262,72 +260,14 @@ def get_opener(mode: Literal["r", "w"], compressed: bool): return partial(open, mode=mode) -def dataset_to_json(ds: xr.Dataset, path: PathLike, compressed=False) -> None: - """Serialize a dataset to JSON with coordinate arrays. +def strip_datavars(ds: xr.Dataset) -> xr.Dataset: + """Replace the data-variables with zeros. Leave the coordinates as-is.""" + import dask.array as da - Does not support CFtime time coordinate. - """ - path = Path(path) - if compressed: - if path.suffix != ".gz": - raise ValueError(f"Path suffix must be '.gz' . Got {path.suffix}") - else: - if path.suffix != ".json": - raise ValueError(f"Path suffix must be '.json' . Got {path.suffix}") - - _open = get_opener(mode="w", compressed=compressed) - - with _open(path) as f: - d = { - "version": "1", - "dataset": _dataset_to_dict_with_coordinate_arrays(ds), - } - try: - json.dump(d, f, cls=_XarrayEncoder) - except TypeError as e: - e.add_note( - "This function does not support CFtime time coordinates. Replace with datetime or float coordinates using (e.g., `ds['time'] = ...`)." 
- ) - raise e - return - - -def _decode_numpy_dict_values(attrs: Mapping[K, V]) -> dict[K, V]: - """Convert attribute values from numpy objects to native Python objects, - for use in to_dict - """ - attrs = dict(attrs) - for k, v in attrs.items(): - if isinstance(v, np.ndarray): - attrs[k] = cast(V, v.tolist()) - elif isinstance(v, np.generic): - attrs[k] = v.item() - return attrs - - -def _dataset_to_dict_with_coordinate_arrays(ds: xr.Dataset) -> dict: - # Implementation mostly copied from xr.Dataset.to_dict() - encoding = False - - d: dict = { - "coords": {}, - "attrs": _decode_numpy_dict_values(ds.attrs), - "dims": dict(ds.sizes), - "data_vars": {}, - } - for k in ds.coords: - d["coords"].update( - { - k: ds[k].variable.to_dict(data="list", encoding=encoding) - } # data='list' so coordinates are written to file - ) + ds = ds.copy() for k in ds.data_vars: - d["data_vars"].update( - {k: ds[k].variable.to_dict(data=False, encoding=encoding)} # data=False so that data isn't writen to file - ) - if encoding: - d["encoding"] = dict(ds.encoding) - return d + ds[k].data = da.zeros_like(ds[k].data) + return ds def _fill_with_dummy_data(d: dict[str, dict]): From e16cb4c06e68638c90e5533b7f6c2b37d41b0de2 Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Wed, 8 Apr 2026 14:30:34 +0200 Subject: [PATCH 11/29] Fix script --- dump-grid-data.py | 34 +++++++++++++++------------------- src/parcels/_datasets/utils.py | 16 +++++++++++++--- 2 files changed, 28 insertions(+), 22 deletions(-) diff --git a/dump-grid-data.py b/dump-grid-data.py index ead91e268..25ffb60a2 100644 --- a/dump-grid-data.py +++ b/dump-grid-data.py @@ -6,6 +6,7 @@ import zarr from parcels._datasets import utils +from parcels._datasets.structured.generic import datasets_sgrid cat = intake.open_catalog("../parcels-benchmarks/data/surf-data/parcels-benchmarks/catalog.yml") @@ -19,40 +20,35 @@ def sizeof_fmt(num, suffix="B"): datasets = { - # "example-dataset-from-parcels": 
datasets_sgrid["ds_2d_padded_low"], - # "fesom_mesh": cat.fesom_baroclinic_gyre_mesh.to_dask(), - "moi_mesh": cat.moi_mesh.to_dask().set_coords(["glamf", "glamu"]), + "example-dataset-from-parcels": (datasets_sgrid["ds_2d_padded_low"], []), + "fesom_mesh": (cat.fesom_baroclinic_gyre_mesh.to_dask(), []), + "moi_mesh": (cat.moi_mesh.to_dask(), ["glamf", "glamu"]), } zarrs_folder = Path("zarrs") zarrs_folder.mkdir(exist_ok=True) -for k, ds in datasets.items(): +for k, (ds, except_for) in datasets.items(): path = zarrs_folder / f"{datetime.datetime.now().isoformat()}-{k}.zip" - ds = ds.pipe(utils.strip_datavars) + ds = ds.pipe(utils.replace_data_vars_with_zeros, except_for=except_for) nbytes_uncompressed_full_dataset = ds.nbytes - nbytes_uncompressed_coords = 0 + nbytes_uncompressed_trimmed = 0 for c in ds.coords: - nbytes_uncompressed_coords += ds.coords[c].nbytes + nbytes_uncompressed_trimmed += ds[c].nbytes + for d in ds.data_vars: + if d in except_for: + nbytes_uncompressed_trimmed += ds[d].nbytes ds.to_zarr(zarr.storage.ZipStore(path)) nbytes_compressed = os.path.getsize(path) - print(r"Summary for dataset {k!r}") + print(f"Summary for dataset {k!r}") print("=========================") print(f"Original dataset uncompressed size: {sizeof_fmt(nbytes_uncompressed_full_dataset):>8}") - print(f"Coords dataset uncompressed size: {sizeof_fmt(nbytes_uncompressed_coords):>8}") + print(f"Trimmed dataset uncompressed size: {sizeof_fmt(nbytes_uncompressed_trimmed):>8}") print(f"Compressed Zarr with coordinates: {sizeof_fmt(nbytes_compressed):>8}") print("---") - print("Compressed dataset is:") + print("Timmed compressed dataset is:") print(f" -{nbytes_compressed / nbytes_uncompressed_full_dataset:.1%} of original") - print(f" -{nbytes_compressed / nbytes_uncompressed_coords:.1%} of coordinate only") - -# print() -# print("Compressed JSON representation of datasets") -# print("==========================================") -# for k, ds in datasets.items(): -# path = 
Path(f"jsons/{datetime.datetime.now().isoformat()}-{k}.json.gz") -# utils.dataset_to_json(ds, path, compressed=True) -# print(f"Dataset {k} JSON representation is size: {sizeof_fmt(os.path.getsize(path)):>8}") + print(f" -{nbytes_compressed / nbytes_uncompressed_trimmed:.1%} of trimmed uncompressed") diff --git a/src/parcels/_datasets/utils.py b/src/parcels/_datasets/utils.py index 3517cd4fc..1802fa7dc 100644 --- a/src/parcels/_datasets/utils.py +++ b/src/parcels/_datasets/utils.py @@ -260,12 +260,22 @@ def get_opener(mode: Literal["r", "w"], compressed: bool): return partial(open, mode=mode) -def strip_datavars(ds: xr.Dataset) -> xr.Dataset: - """Replace the data-variables with zeros. Leave the coordinates as-is.""" +def replace_data_vars_with_zeros(ds: xr.Dataset, except_for: list[str] | None = None) -> xr.Dataset: + """Replace datavars in the xarray dataset with with zeros, except for some. + + If except_for is None: + - Replace all non-coordinate arrays with zeros + + If except_for is not None: + - Exclude items listed from the replacement + """ import dask.array as da + if except_for is None: + except_for = [] + ds = ds.copy() - for k in ds.data_vars: + for k in set(ds.data_vars) - set(except_for): ds[k].data = da.zeros_like(ds[k].data) return ds From bc29541a8504e1d308cc12f75263414bac3c9bff Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Wed, 8 Apr 2026 14:59:47 +0200 Subject: [PATCH 12/29] Update script --- dump-grid-data.py | 2 +- src/parcels/_datasets/utils.py | 73 +++++----------------------------- 2 files changed, 12 insertions(+), 63 deletions(-) diff --git a/dump-grid-data.py b/dump-grid-data.py index 25ffb60a2..8ff55b8ed 100644 --- a/dump-grid-data.py +++ b/dump-grid-data.py @@ -29,7 +29,7 @@ def sizeof_fmt(num, suffix="B"): zarrs_folder.mkdir(exist_ok=True) for k, (ds, except_for) in datasets.items(): path = zarrs_folder / f"{datetime.datetime.now().isoformat()}-{k}.zip" - ds = 
ds.pipe(utils.replace_data_vars_with_zeros, except_for=except_for) + ds = ds.pipe(utils.replace_arrays_with_zeros, except_for=except_for) nbytes_uncompressed_full_dataset = ds.nbytes nbytes_uncompressed_trimmed = 0 diff --git a/src/parcels/_datasets/utils.py b/src/parcels/_datasets/utils.py index 1802fa7dc..375525b8f 100644 --- a/src/parcels/_datasets/utils.py +++ b/src/parcels/_datasets/utils.py @@ -1,14 +1,10 @@ import copy -import json -from datetime import datetime -from functools import partial +from collections.abc import Hashable from typing import Any, Literal, TypeVar import numpy as np import xarray as xr -from parcels._typing import PathLike - _SUPPORTED_ATTR_TYPES = int | float | str | np.ndarray K = TypeVar("K") V = TypeVar("V") @@ -210,72 +206,25 @@ def from_xarray_dataset_dict(d) -> xr.Dataset: return xr.Dataset.from_dict(_fill_with_dummy_data(copy.deepcopy(d))) -class _XarrayEncoder(json.JSONEncoder): - """Convert all datetime objects within to be isoformat strings.""" - - def default(self, o: Any) -> Any: - if isinstance(o, datetime): - return o.isoformat() - return super().default(o) - - -class _XarrayDecoder(json.JSONDecoder): - """Convert all isoformat datetime strings within to be datetime objects.""" - - def raw_decode(self, s: str, idx: int = 0) -> tuple[Any, int]: - obj, end = super().raw_decode(s, idx) - return self._convert(obj), end - - @staticmethod - def _convert(obj: Any) -> Any: - if isinstance(obj, str): - try: - return datetime.fromisoformat(obj) - except ValueError: - return obj - if isinstance(obj, list): - return [_XarrayDecoder._convert(v) for v in obj] - if isinstance(obj, dict): - return {k: _XarrayDecoder._convert(v) for k, v in obj.items()} - return obj - - -def dataset_from_json(path: PathLike) -> xr.Dataset: - """Read in a representative Xarray dataset from a JSON filed created by `dataset_to_json`.""" - with open(path) as f: - d = json.load(f, cls=_XarrayDecoder) - assert d["version"] == "1", f"Version of Parcels JSON CDL 
representation must be '1'. Got {d['version']!r}" - - ds_dict = _fill_with_dummy_data(d["dataset"]) - - return xr.Dataset.from_dict(ds_dict) - - -def get_opener(mode: Literal["r", "w"], compressed: bool): - import gzip - - if compressed: - return partial(gzip.open, mode=f"{mode}t", encoding="utf-8") - else: - return partial(open, mode=mode) - - -def replace_data_vars_with_zeros(ds: xr.Dataset, except_for: list[str] | None = None) -> xr.Dataset: +def replace_arrays_with_zeros( + ds: xr.Dataset, except_for: Literal["coords"] | list[Hashable] | None = None +) -> xr.Dataset: """Replace datavars in the xarray dataset with with zeros, except for some. - If except_for is None: - - Replace all non-coordinate arrays with zeros - - If except_for is not None: - - Exclude items listed from the replacement + except_for options: + - except_for=None: Replace all arrays with zeros + - except_for='coords': Replace all arrays with zeros except the coords + - except_for=[...]: Provide list of items to exclude """ import dask.array as da if except_for is None: except_for = [] + if except_for == "coords": + except_for = list(ds.coords.keys()) ds = ds.copy() - for k in set(ds.data_vars) - set(except_for): + for k in set(ds.data_vars) | set(ds.coords) - set(except_for): ds[k].data = da.zeros_like(ds[k].data) return ds From 46168980a82bcf83cd0479f10817323ac1bc1364 Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Wed, 8 Apr 2026 15:02:56 +0200 Subject: [PATCH 13/29] Add informative error messaging --- src/parcels/_datasets/utils.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/parcels/_datasets/utils.py b/src/parcels/_datasets/utils.py index 375525b8f..b8a8fbd96 100644 --- a/src/parcels/_datasets/utils.py +++ b/src/parcels/_datasets/utils.py @@ -224,8 +224,14 @@ def replace_arrays_with_zeros( except_for = list(ds.coords.keys()) ds = ds.copy() - for k in set(ds.data_vars) | set(ds.coords) - set(except_for): + ds_keys 
= set(ds.data_vars) | set(ds.coords) + for k in except_for: + if k not in ds_keys: + raise ValueError(f"Item {k!r} in `except_for` not a valid item in dataset. Got {except_for=!r}.") + + for k in ds_keys - set(except_for): ds[k].data = da.zeros_like(ds[k].data) + return ds From 9f191b63420ec4b87507248df179f699f6500d1c Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Wed, 8 Apr 2026 15:09:14 +0200 Subject: [PATCH 14/29] Update tests --- tests/datasets/test_utils.py | 90 +++++++++++++++++++++++------------- 1 file changed, 59 insertions(+), 31 deletions(-) diff --git a/tests/datasets/test_utils.py b/tests/datasets/test_utils.py index 90bf91932..f4aa19bd1 100644 --- a/tests/datasets/test_utils.py +++ b/tests/datasets/test_utils.py @@ -1,6 +1,3 @@ -import json -from datetime import datetime - import numpy as np import pytest import xarray as xr @@ -9,14 +6,6 @@ from parcels._datasets.structured.generic import datasets -def test_datetime_encoder_decoder_roundtrip(): - dt = datetime(2000, 1, 15, 12, 30, 45) - data = {"timestamps": [dt, dt], "nested": {"time": dt}, "value": 42} - encoded = json.dumps(data, cls=utils._DatetimeEncoder) - decoded = json.loads(encoded, cls=utils._DatetimeDecoder) - assert decoded == data - - def test_from_xarray_dataset_dict(): ds_expected = datasets["ds_2d_left"] d = ds_expected.to_dict(data=False) @@ -42,28 +31,67 @@ def _replace_with_cf_time(ds) -> xr.Dataset: return ds -@pytest.mark.parametrize("ds", [pytest.param(v, id=k) for k, v in datasets.items()]) -def test_dataset_json_roundtrip(ds: xr.Dataset, tmp_path): - path = tmp_path / "dataset-metadata.json" - utils.dataset_to_json(ds, path) - ds_parsed = utils.dataset_from_json(path) +@pytest.fixture +def nonzero_ds(): + """Small dataset with nonzero data_vars and non-index coords for replace_arrays_with_zeros tests. + + Uses 2D lon/lat as coords so they are regular (non-index) variables that can be zeroed. 
+ """ + import dask.array as da + + lon = np.array([[1.0, 2.0, 3.0, 4.0]] * 3) + lat = np.array([[10.0] * 4, [20.0] * 4, [30.0] * 4]) + return xr.Dataset( + { + "U": (["y", "x"], da.from_array(np.ones((3, 4)), chunks=-1)), + "V": (["y", "x"], da.from_array(np.full((3, 4), 2.0), chunks=-1)), + }, + coords={ + "lon": (["y", "x"], da.from_array(lon, chunks=-1)), + "lat": (["y", "x"], da.from_array(lat, chunks=-1)), + }, + ) + + +def test_replace_arrays_with_zeros_none(nonzero_ds): + """except_for=None: all data_vars and coords replaced with zeros.""" + result = utils.replace_arrays_with_zeros(nonzero_ds, except_for=None) + + for k in set(result.data_vars) | set(result.coords): + assert np.all(result[k].values == 0), f"{k!r} should be zero" + + +def test_replace_arrays_with_zeros_coords(nonzero_ds): + """except_for='coords': data_vars zeroed, coords preserved.""" + result = utils.replace_arrays_with_zeros(nonzero_ds, except_for="coords") + + for k in result.data_vars: + assert np.all(result[k].values == 0), f"data_var {k!r} should be zero" + + np.testing.assert_array_equal(result["lon"].values, nonzero_ds["lon"].values) + np.testing.assert_array_equal(result["lat"].values, nonzero_ds["lat"].values) + + +def test_replace_arrays_with_zeros_list(nonzero_ds): + """except_for=[...]: listed variables preserved, others zeroed.""" + result = utils.replace_arrays_with_zeros(nonzero_ds, except_for=["U", "lon"]) - assert list(ds_parsed.coords) == list(ds.coords) - assert list(ds_parsed.data_vars) == list(ds.data_vars) + np.testing.assert_array_equal(result["U"].values, nonzero_ds["U"].values) + np.testing.assert_array_equal(result["lon"].values, nonzero_ds["lon"].values) + assert np.all(result["V"].values == 0), "V should be zero" + assert np.all(result["lat"].values == 0), "lat should be zero" - for k in set(ds.data_vars): - assert ds_parsed[k].attrs == ds[k].attrs, f"Attrs for {k!r} do not match" - for k in set(ds.coords): - assert ds_parsed[k].attrs == ds[k].attrs, f"Attrs for 
{k!r} do not match" - if isinstance(ds_parsed[k].dtype, np.dtypes.DateTime64DType): - np.testing.assert_equal(ds_parsed[k].values, ds[k].values) - else: - np.testing.assert_allclose(ds_parsed[k].values, ds[k].values) +def test_replace_arrays_with_zeros_does_not_mutate(nonzero_ds): + """Original dataset is not modified.""" + original_U = nonzero_ds["U"].values.copy() + original_lon = nonzero_ds["lon"].values.copy() + utils.replace_arrays_with_zeros(nonzero_ds, except_for=None) + np.testing.assert_array_equal(nonzero_ds["U"].values, original_U) + np.testing.assert_array_equal(nonzero_ds["lon"].values, original_lon) -@pytest.mark.parametrize("ds", [pytest.param(_replace_with_cf_time(datasets["ds_2d_left"]), id="cftime-ds_2d_left")]) -def test_dataset_json_errors_with_cftime(ds: xr.Dataset, tmp_path): - path = tmp_path / "dataset-metadata.json" - with pytest.raises(TypeError, match="Object of type Datetime.* is not JSON serializable"): - utils.dataset_to_json(ds, path) +def test_replace_arrays_with_zeros_invalid_key(nonzero_ds): + """Invalid key in except_for raises ValueError.""" + with pytest.raises(ValueError, match="not a valid item"): + utils.replace_arrays_with_zeros(nonzero_ds, except_for=["nonexistent"]) From 03357aef34398f040891c397c7ae2e28fa9db9ee Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Wed, 8 Apr 2026 15:32:32 +0200 Subject: [PATCH 15/29] Update `posting-issues.md` doc page --- .github/ISSUE_TEMPLATE/02_bug.yaml | 2 +- docs/development/posting-issues.md | 75 +++++++++++++++++++++++------- 2 files changed, 58 insertions(+), 19 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/02_bug.yaml b/.github/ISSUE_TEMPLATE/02_bug.yaml index 53f0a4848..e21f384dc 100644 --- a/.github/ISSUE_TEMPLATE/02_bug.yaml +++ b/.github/ISSUE_TEMPLATE/02_bug.yaml @@ -17,7 +17,7 @@ body: - type: "textarea" attributes: label: "Code sample" - description: "If relevant, please provide a code example where this bug is shown as well as any 
error message. A [minimal, reproducible example](https://stackoverflow.com/help/minimal-reproducible-example) is preffered as it makes it much easier for developers to identify the cause of the bug. This also allows them quickly determine whether the problem is with your code or with Parcels itself. If you want support on a specific dataset, please [follow our instructions on how to share dataset metadata](https://docs.parcels-code.org/en/main/development/posting-issues.html)" + description: "If relevant, please provide a code example where this bug is shown as well as any error message. A [minimal, reproducible example](https://stackoverflow.com/help/minimal-reproducible-example) is preferred as it makes it much easier for developers to identify the cause of the bug. This also allows them to quickly determine whether the problem is with your code or with Parcels itself. If you want support on a specific dataset, please [follow our instructions on how to share representative datasets](https://docs.parcels-code.org/en/main/development/posting-issues.html)" value: | ```python # Paste your code within this block diff --git a/docs/development/posting-issues.md b/docs/development/posting-issues.md index 6edd911d3..15b8636c4 100644 --- a/docs/development/posting-issues.md +++ b/docs/development/posting-issues.md @@ -20,51 +20,90 @@ Following these templates provides structure and ensures that we have all the ne Parcels is designed to work with a large range of input datasets. When extending support for various input datasets, or trying to debug problems -that only occur with specific datasets, having the dataset metadata is very valuable. +that only occur with specific datasets, having access to your dataset (or a +close representation of it) is very valuable.
-This metadata could include information such as: +This could include information such as: - the nature of the array variables (e.g., via CF compliant metadata) - descriptions about the origin of the dataset, or additional comments - the shapes and data types of the arrays +- the grid topology (coordinates and key variables) This also allows us to see if your metadata is broken/non-compliant with standards - where we can then suggest fixes for you (and maybe we can tell the data provider!). Since version 4 of Parcels we rely much more on metadata to discover information about your input data. -Sharing this metadata often provides enough debugging information to solve your problem, instead of having to share a whole dataset. +Sharing a compact representation of your dataset often provides enough information to solve your problem, without having to share the full dataset (which may be very large or contain sensitive data). -Sharing dataset metadata is made easy in Parcels. +Parcels makes this easy by replacing irrelevant array data with zeros and saving the result as a compressed Zarr zip store, which is typically small enough to attach directly to a GitHub issue. ### Step 1. Users As a user with access to your dataset, you would do: ```{code-cell} -import json +:tags: [hide-cell] import xarray as xr +from parcels._datasets.structured.generic import datasets +datasets['ds_2d_left'].to_netcdf("my_dataset.nc") +``` + +```{code-cell} +import os + +import xarray as xr +import zarr + +from parcels._datasets.utils import replace_arrays_with_zeros + +# load your dataset +ds = xr.open_dataset("my_dataset.nc") # or xr.open_zarr(...), etc. + +# Replace all data arrays with zeros, keeping coordinate metadata. +# This keeps array shapes and metadata while removing actual data. 
+# +# You can customise `except_for` to also retain actual values for specific variables: +# except_for='coords' — keep coordinate arrays (useful for grid topology) +# except_for=['lon', 'lat'] — keep a specific list of variables +ds_trimmed = replace_arrays_with_zeros(ds) # default: except_for=None -# defining an example dataset to illustrate -# (you would use `xr.open_dataset(...)` instead) -ds = xr.Dataset(attrs={"description": "my dataset"}) +# Save to a zipped Zarr store +output_file = "my_dataset.zip" +ds_trimmed.to_zarr(zarr.storage.ZipStore(output_file)) -output_file = "my_dataset.json" -with open(output_file, "w") as f: - json.dump(ds.to_dict(data=False), f) # write your dataset to a JSON excluding array data +# Check the file size (aim for < 25 MB so it can be attached to a GitHub issue) +size_mb = os.path.getsize(output_file) / 1e6 +print(f"Zip store size: {size_mb:.1f} MB") ``` -Then attach the JSON file written above alongside your issue +Then attach the zip file written above alongside your issue. + +If the file is larger than 25 MB, try passing `except_for=None` (the default) +to ensure all arrays are zeroed out. If it is still too large, consider +subsetting your dataset to a smaller spatial or temporal region before saving. ### Step 2. Maintainers and developers -As developers looking to inspect the metadata, we would do: +As developers looking to inspect the dataset, we would do: ```{code-cell} -from parcels._datasets.utils import from_xarray_dataset_dict +import xarray as xr +import zarr + +ds = xr.open_zarr(zarr.storage.ZipStore("my_dataset.zip")) +ds +``` + +```{code-cell} +:tags: [hide-cell] + +del ds +from pathlib import Path +Path("my_dataset.zip").unlink() +Path("my_dataset.nc").unlink() -with open(output_file) as f: - d = json.load(f) -ds = from_xarray_dataset_dict(d) ``` -From there we can take a look the metadata of your dataset! +From there we can take a look at the structure and metadata of your dataset! 
+This also makes it straightforward for us to add this dataset to our test suite. From f633c6a52c2a4c11b6cc91ca89cc0030539f99f5 Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Wed, 8 Apr 2026 16:01:23 +0200 Subject: [PATCH 16/29] Update array replacement --- src/parcels/_datasets/utils.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/parcels/_datasets/utils.py b/src/parcels/_datasets/utils.py index b8a8fbd96..80a02105c 100644 --- a/src/parcels/_datasets/utils.py +++ b/src/parcels/_datasets/utils.py @@ -213,7 +213,7 @@ def replace_arrays_with_zeros( except_for options: - except_for=None: Replace all arrays with zeros - - except_for='coords': Replace all arrays with zeros except the coords + - except_for='coords': Replace all arrays with zeros except the non-index coords - except_for=[...]: Provide list of items to exclude """ import dask.array as da @@ -230,7 +230,12 @@ def replace_arrays_with_zeros( raise ValueError(f"Item {k!r} in `except_for` not a valid item in dataset. Got {except_for=!r}.") for k in ds_keys - set(except_for): - ds[k].data = da.zeros_like(ds[k].data) + data = da.zeros_like(ds[k].data) + try: + ds[k].data = data + except ValueError: + # Cannot assign to dimension coordinate, leave as is + pass return ds From 074bd49441477ffc2ebdde84a9dffb2cc3948963 Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Wed, 8 Apr 2026 16:26:59 +0200 Subject: [PATCH 17/29] fix --- docs/development/posting-issues.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/development/posting-issues.md b/docs/development/posting-issues.md index 15b8636c4..2a76fd7ec 100644 --- a/docs/development/posting-issues.md +++ b/docs/development/posting-issues.md @@ -68,12 +68,12 @@ ds = xr.open_dataset("my_dataset.nc") # or xr.open_zarr(...), etc. 
# except_for=['lon', 'lat'] — keep a specific list of variables ds_trimmed = replace_arrays_with_zeros(ds) # default: except_for=None -# Save to a zipped Zarr store -output_file = "my_dataset.zip" -ds_trimmed.to_zarr(zarr.storage.ZipStore(output_file)) +# Save to a zipped Zarr store - replace `my_dataset` with a more informative name +with zarr.storage.ZipStore("my_dataset.zip", mode='w') as store: + ds_trimmed.to_zarr(store) # Check the file size (aim for < 25 MB so it can be attached to a GitHub issue) -size_mb = os.path.getsize(output_file) / 1e6 +size_mb = os.path.getsize("my_dataset.zip") / 1e6 print(f"Zip store size: {size_mb:.1f} MB") ``` @@ -91,7 +91,7 @@ As developers looking to inspect the dataset, we would do: import xarray as xr import zarr -ds = xr.open_zarr(zarr.storage.ZipStore("my_dataset.zip")) +ds = xr.open_zarr(zarr.storage.ZipStore("my_dataset.zip", mode="r")) ds ``` From 18a845f0cb645b2ad012a379c58e4565a9dd3277 Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Wed, 8 Apr 2026 16:42:04 +0200 Subject: [PATCH 18/29] Remove script --- dump-grid-data.py | 54 ----------------------------------------------- 1 file changed, 54 deletions(-) delete mode 100644 dump-grid-data.py diff --git a/dump-grid-data.py b/dump-grid-data.py deleted file mode 100644 index 8ff55b8ed..000000000 --- a/dump-grid-data.py +++ /dev/null @@ -1,54 +0,0 @@ -import datetime -import os -from pathlib import Path - -import intake -import zarr - -from parcels._datasets import utils -from parcels._datasets.structured.generic import datasets_sgrid - -cat = intake.open_catalog("../parcels-benchmarks/data/surf-data/parcels-benchmarks/catalog.yml") - - -def sizeof_fmt(num, suffix="B"): - for unit in ("", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi"): - if abs(num) < 1024.0: - return f"{num:3.1f}{unit}{suffix}" - num /= 1024.0 - return f"{num:.1f}Yi{suffix}" - - -datasets = { - "example-dataset-from-parcels": (datasets_sgrid["ds_2d_padded_low"], []), 
- "fesom_mesh": (cat.fesom_baroclinic_gyre_mesh.to_dask(), []), - "moi_mesh": (cat.moi_mesh.to_dask(), ["glamf", "glamu"]), -} - -zarrs_folder = Path("zarrs") -zarrs_folder.mkdir(exist_ok=True) -for k, (ds, except_for) in datasets.items(): - path = zarrs_folder / f"{datetime.datetime.now().isoformat()}-{k}.zip" - ds = ds.pipe(utils.replace_arrays_with_zeros, except_for=except_for) - nbytes_uncompressed_full_dataset = ds.nbytes - nbytes_uncompressed_trimmed = 0 - - for c in ds.coords: - nbytes_uncompressed_trimmed += ds[c].nbytes - for d in ds.data_vars: - if d in except_for: - nbytes_uncompressed_trimmed += ds[d].nbytes - - ds.to_zarr(zarr.storage.ZipStore(path)) - - nbytes_compressed = os.path.getsize(path) - - print(f"Summary for dataset {k!r}") - print("=========================") - print(f"Original dataset uncompressed size: {sizeof_fmt(nbytes_uncompressed_full_dataset):>8}") - print(f"Trimmed dataset uncompressed size: {sizeof_fmt(nbytes_uncompressed_trimmed):>8}") - print(f"Compressed Zarr with coordinates: {sizeof_fmt(nbytes_compressed):>8}") - print("---") - print("Timmed compressed dataset is:") - print(f" -{nbytes_compressed / nbytes_uncompressed_full_dataset:.1%} of original") - print(f" -{nbytes_compressed / nbytes_uncompressed_trimmed:.1%} of trimmed uncompressed") From d9ad51ca4e7ebdf44662394a8ab6d02623cd401f Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Wed, 8 Apr 2026 16:46:19 +0200 Subject: [PATCH 19/29] Add test against different datasets For robustness --- tests/datasets/test_utils.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/datasets/test_utils.py b/tests/datasets/test_utils.py index f4aa19bd1..4c34bdd3f 100644 --- a/tests/datasets/test_utils.py +++ b/tests/datasets/test_utils.py @@ -52,6 +52,10 @@ def nonzero_ds(): }, ) +@pytest.mark.parametrize("ds", [pytest.param(v, id=k) for k,v in datasets.items()]) +@pytest.mark.parametrize("except_for", [None, "coords"]) +def 
test_replace_arrays_with_zeros(ds, except_for): + utils.replace_arrays_with_zeros(ds, except_for=except_for) def test_replace_arrays_with_zeros_none(nonzero_ds): """except_for=None: all data_vars and coords replaced with zeros.""" From 975dd75a9079d5eed1500834474e20224060f7bc Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Wed, 8 Apr 2026 16:47:01 +0200 Subject: [PATCH 20/29] Remove from_xarray_dataset_dict --- src/parcels/_datasets/utils.py | 18 ------------------ tests/datasets/test_utils.py | 16 +++------------- 2 files changed, 3 insertions(+), 31 deletions(-) diff --git a/src/parcels/_datasets/utils.py b/src/parcels/_datasets/utils.py index 80a02105c..09a82e88e 100644 --- a/src/parcels/_datasets/utils.py +++ b/src/parcels/_datasets/utils.py @@ -1,4 +1,3 @@ -import copy from collections.abc import Hashable from typing import Any, Literal, TypeVar @@ -189,23 +188,6 @@ def verbose_print(*args, **kwargs): verbose_print("=" * 30 + " End of Comparison " + "=" * 30) -def from_xarray_dataset_dict(d) -> xr.Dataset: - """Reconstruct a dataset with zero data from the output of ``xarray.Dataset.to_dict(data=False)``. - - Useful in issues helping users debug fieldsets - sharing dataset schemas with associated metadata - without sharing the data itself. 
- - Example - ------- - >>> import xarray as xr - >>> from parcels._datasets.structured.generic import datasets - >>> ds = datasets['ds_2d_left'] - >>> d = ds.to_dict(data=False) - >>> ds2 = from_xarray_dataset_dict(d) - """ - return xr.Dataset.from_dict(_fill_with_dummy_data(copy.deepcopy(d))) - - def replace_arrays_with_zeros( ds: xr.Dataset, except_for: Literal["coords"] | list[Hashable] | None = None ) -> xr.Dataset: diff --git a/tests/datasets/test_utils.py b/tests/datasets/test_utils.py index 4c34bdd3f..46019b909 100644 --- a/tests/datasets/test_utils.py +++ b/tests/datasets/test_utils.py @@ -6,18 +6,6 @@ from parcels._datasets.structured.generic import datasets -def test_from_xarray_dataset_dict(): - ds_expected = datasets["ds_2d_left"] - d = ds_expected.to_dict(data=False) - ds = utils.from_xarray_dataset_dict(d) - - assert list(ds.coords) == list(ds_expected.coords) - assert list(ds.data_vars) == list(ds_expected.data_vars) - - for k in set(ds.coords) | set(ds.data_vars): - assert ds[k].attrs == ds_expected[k].attrs, f"Attrs for {k!r} do not match" - - def _replace_with_cf_time(ds) -> xr.Dataset: import cftime @@ -52,11 +40,13 @@ def nonzero_ds(): }, ) -@pytest.mark.parametrize("ds", [pytest.param(v, id=k) for k,v in datasets.items()]) + +@pytest.mark.parametrize("ds", [pytest.param(v, id=k) for k, v in datasets.items()]) @pytest.mark.parametrize("except_for", [None, "coords"]) def test_replace_arrays_with_zeros(ds, except_for): utils.replace_arrays_with_zeros(ds, except_for=except_for) + def test_replace_arrays_with_zeros_none(nonzero_ds): """except_for=None: all data_vars and coords replaced with zeros.""" result = utils.replace_arrays_with_zeros(nonzero_ds, except_for=None) From 5b9aa91a6975d202f30fb7d3e140399c8f637d6c Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Wed, 8 Apr 2026 16:48:27 +0200 Subject: [PATCH 21/29] copy --- docs/development/posting-issues.md | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/docs/development/posting-issues.md b/docs/development/posting-issues.md index 2a76fd7ec..e7c68b216 100644 --- a/docs/development/posting-issues.md +++ b/docs/development/posting-issues.md @@ -105,5 +105,5 @@ Path("my_dataset.nc").unlink() ``` -From there we can take a look at the structure and metadata of your dataset! +From there we can take a look at the structure, metadata, and grid topology of your dataset! This also makes it straightforward for us to add this dataset to our test suite. From b758bee8b684bd71293725c6ba72abbb3170975c Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Wed, 8 Apr 2026 16:53:18 +0200 Subject: [PATCH 22/29] Add comments --- docs/development/posting-issues.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/development/posting-issues.md b/docs/development/posting-issues.md index e7c68b216..5116d8354 100644 --- a/docs/development/posting-issues.md +++ b/docs/development/posting-issues.md @@ -44,6 +44,7 @@ As a user with access to your dataset, you would do: ```{code-cell} :tags: [hide-cell] +# Generate an example dataset to zip. The user would use their own. 
import xarray as xr from parcels._datasets.structured.generic import datasets datasets['ds_2d_left'].to_netcdf("my_dataset.nc") @@ -98,6 +99,7 @@ ds ```{code-cell} :tags: [hide-cell] +# Cleanup files in doc build process del ds from pathlib import Path Path("my_dataset.zip").unlink() From b8678a9cbbac15cb5eae8b56c0f9189255bd0d51 Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Wed, 8 Apr 2026 16:55:10 +0200 Subject: [PATCH 23/29] Update docstring --- src/parcels/_datasets/utils.py | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/src/parcels/_datasets/utils.py b/src/parcels/_datasets/utils.py index 09a82e88e..c3f4f0b80 100644 --- a/src/parcels/_datasets/utils.py +++ b/src/parcels/_datasets/utils.py @@ -191,12 +191,23 @@ def verbose_print(*args, **kwargs): def replace_arrays_with_zeros( ds: xr.Dataset, except_for: Literal["coords"] | list[Hashable] | None = None ) -> xr.Dataset: - """Replace datavars in the xarray dataset with with zeros, except for some. - - except_for options: - - except_for=None: Replace all arrays with zeros - - except_for='coords': Replace all arrays with zeros except the non-index coords - - except_for=[...]: Provide list of items to exclude + """Replace datavars in the xarray dataset with zeros, except for some. + + Parameters + ---------- + ds : xr.Dataset + The dataset whose arrays will be replaced with zeros. + except_for : "coords" or list of Hashable or None, optional + Controls which arrays are preserved: + + - ``None``: Replace all arrays with zeros. + - ``"coords"``: Replace all arrays with zeros except the non-index coords. + - list: Provide a list of variable/coord names to exclude from zeroing. + + Returns + ------- + xr.Dataset + A copy of ``ds`` with the selected arrays replaced by zeros. 
""" import dask.array as da From c64dacecae51be428a329367f86acbdbcb7c597c Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Wed, 8 Apr 2026 16:55:39 +0200 Subject: [PATCH 24/29] Remove typevars --- src/parcels/_datasets/utils.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/parcels/_datasets/utils.py b/src/parcels/_datasets/utils.py index c3f4f0b80..505ab5b0a 100644 --- a/src/parcels/_datasets/utils.py +++ b/src/parcels/_datasets/utils.py @@ -1,12 +1,10 @@ from collections.abc import Hashable -from typing import Any, Literal, TypeVar +from typing import Any, Literal import numpy as np import xarray as xr _SUPPORTED_ATTR_TYPES = int | float | str | np.ndarray -K = TypeVar("K") -V = TypeVar("V") def _print_mismatched_keys(d1: dict[Any, Any], d2: dict[Any, Any]) -> None: From ebb0a7ab44b1ffc94e80b0adb58588433f5fbd7d Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Wed, 8 Apr 2026 16:57:01 +0200 Subject: [PATCH 25/29] Remove unused helper --- tests/datasets/test_utils.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/tests/datasets/test_utils.py b/tests/datasets/test_utils.py index 46019b909..828f22359 100644 --- a/tests/datasets/test_utils.py +++ b/tests/datasets/test_utils.py @@ -6,19 +6,6 @@ from parcels._datasets.structured.generic import datasets -def _replace_with_cf_time(ds) -> xr.Dataset: - import cftime - - assert "time" in ds, "Dataset must have a dimension named 'time'" - ntime = 12 - ntime = min(ntime, len(ds.time.values)) - ds = ds.isel(time=slice(None, ntime)) - - dates = [cftime.DatetimeNoLeap(1, month, 1) for month in range(1, ntime + 1)] - ds["time"] = dates - return ds - - @pytest.fixture def nonzero_ds(): """Small dataset with nonzero data_vars and non-index coords for replace_arrays_with_zeros tests. 
From 3566c3f6cc648a41d41ced5a6f4d5befe344513b Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Wed, 8 Apr 2026 16:57:18 +0200 Subject: [PATCH 26/29] Remove dep was only added for testing --- pixi.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/pixi.toml b/pixi.toml index a54a1f83e..1e973434b 100644 --- a/pixi.toml +++ b/pixi.toml @@ -92,7 +92,6 @@ icecream = "*" ipykernel = "*" snoop = "*" pyinstrument = "*" -intake-xarray = ">=2.0.0,<3" [feature.devtools.target.linux-64.dependencies] memray = "*" From 1e568242ad717423c94bff2e38146fc5faf7d6a2 Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Wed, 8 Apr 2026 16:58:30 +0200 Subject: [PATCH 27/29] Add comment --- tests/datasets/test_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/datasets/test_utils.py b/tests/datasets/test_utils.py index 828f22359..b376283e3 100644 --- a/tests/datasets/test_utils.py +++ b/tests/datasets/test_utils.py @@ -31,6 +31,7 @@ def nonzero_ds(): @pytest.mark.parametrize("ds", [pytest.param(v, id=k) for k, v in datasets.items()]) @pytest.mark.parametrize("except_for", [None, "coords"]) def test_replace_arrays_with_zeros(ds, except_for): + # make sure doesn't error with range of datasets utils.replace_arrays_with_zeros(ds, except_for=except_for) From 9845be9b3b52187ae763303b2131cd3b50fbb0a3 Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Thu, 9 Apr 2026 12:40:15 +0200 Subject: [PATCH 28/29] Update docs --- docs/development/posting-issues.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/development/posting-issues.md b/docs/development/posting-issues.md index 5116d8354..f46ab9166 100644 --- a/docs/development/posting-issues.md +++ b/docs/development/posting-issues.md @@ -73,6 +73,9 @@ ds_trimmed = replace_arrays_with_zeros(ds) # default: except_for=None with zarr.storage.ZipStore("my_dataset.zip", mode='w') as store: 
ds_trimmed.to_zarr(store) +size_mb_original = os.path.getsize("my_dataset.nc") / 1e6 +print(f"Original size: {size_mb_original:.1f} MB") + # Check the file size (aim for < 25 MB so it can be attached to a GitHub issue) size_mb = os.path.getsize("my_dataset.zip") / 1e6 print(f"Zip store size: {size_mb:.1f} MB") From f4c28764003765cf156ac5c94b41cfcc31c7fc6c Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Thu, 9 Apr 2026 12:42:48 +0200 Subject: [PATCH 29/29] Update docs according to review feedback --- docs/development/posting-issues.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/development/posting-issues.md b/docs/development/posting-issues.md index f46ab9166..7dc6c5c28 100644 --- a/docs/development/posting-issues.md +++ b/docs/development/posting-issues.md @@ -67,7 +67,8 @@ ds = xr.open_dataset("my_dataset.nc") # or xr.open_zarr(...), etc. # You can customise `except_for` to also retain actual values for specific variables: # except_for='coords' — keep coordinate arrays (useful for grid topology) # except_for=['lon', 'lat'] — keep a specific list of variables -ds_trimmed = replace_arrays_with_zeros(ds) # default: except_for=None +# except_for=None — zero out all arrays (useful to know about dtypes, structure, and metadata). This is the default for the function. +ds_trimmed = replace_arrays_with_zeros(ds, except_for=None) # Save to a zipped Zarr store - replace `my_dataset` with a more informative name with zarr.storage.ZipStore("my_dataset.zip", mode='w') as store: