From cd494a048da07e61119b44cb40f74fbbdbb3f68c Mon Sep 17 00:00:00 2001 From: Bouwe Andela Date: Thu, 19 Feb 2026 10:44:54 +0100 Subject: [PATCH 1/5] Avoid issue with deepcopying/pickling IntakeESGFDatasets --- esmvalcore/preprocessor/__init__.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/esmvalcore/preprocessor/__init__.py b/esmvalcore/preprocessor/__init__.py index 522a63b3d3..a67613e6fa 100644 --- a/esmvalcore/preprocessor/__init__.py +++ b/esmvalcore/preprocessor/__init__.py @@ -69,6 +69,7 @@ extract_levels, extract_location, extract_point, + is_dataset, regrid, ) from esmvalcore.preprocessor._rolling_window import rolling_window_statistics @@ -619,7 +620,7 @@ def __init__( self, filename: Path, attributes: dict[str, Any] | None = None, - settings: dict[str, Any] | None = None, + settings: dict[str, dict[str, Any]] | None = None, datasets: list[Dataset] | None = None, ) -> None: if datasets is not None: @@ -644,6 +645,22 @@ def __init__( # Set some preprocessor settings (move all defaults here?) if settings is None: settings = {} + + # Create a copy of any datasets in settings. This drops the information + # in Dataset.files and avoids issues with deepcopying and pickling + # those files. This is needed because + # esmvalcore.io.intake_esgf.IntakeESGFDataset objects use a + # requests_cache.CachedSession object that cannot be deepcopied or + # pickled. 
+ settings = { + fn: { + arg: ( + value.copy() if is_dataset(value) else copy.deepcopy(value) + ) + for arg, value in kwargs.items() + } + for fn, kwargs in settings.items() + } self.settings = copy.deepcopy(settings) if attributes is None: attributes = {} From 11574e6c8f46b128e0167732ce52d6a5ef92f197 Mon Sep 17 00:00:00 2001 From: Bouwe Andela Date: Thu, 19 Feb 2026 11:00:55 +0100 Subject: [PATCH 2/5] Nicer error message --- esmvalcore/io/intake_esgf.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/esmvalcore/io/intake_esgf.py b/esmvalcore/io/intake_esgf.py index 3f358f0532..1392fadacf 100644 --- a/esmvalcore/io/intake_esgf.py +++ b/esmvalcore/io/intake_esgf.py @@ -23,6 +23,7 @@ from __future__ import annotations import copy +import logging from dataclasses import dataclass, field from pathlib import Path from typing import TYPE_CHECKING, Any @@ -47,6 +48,8 @@ "IntakeESGFDataset", ] +logger = logging.getLogger(__name__) + class _CachingCatalog(intake_esgf.ESGFCatalog): """An ESGF catalog that caches to_path_dict results.""" @@ -122,6 +125,16 @@ def __hash__(self) -> int: def prepare(self) -> None: """Prepare the data for access.""" + try: + self.catalog.to_path_dict(minimal_keys=False, quiet=True) + except intake_esgf.exceptions.DatasetLoadError: + logger.error( + "Failed to download dataset '%s' from the ESGF. 
Error messages:\n%s", + self.name, + self.catalog.session_log(), + ) + raise + self.catalog.to_path_dict(minimal_keys=False) for index in self.catalog.indices: # Set the sessions to None to avoid issues with pickling From 718631b8d4154518487d37a31fe92ebd5ef76b6a Mon Sep 17 00:00:00 2001 From: Bouwe Andela Date: Thu, 19 Feb 2026 11:10:16 +0100 Subject: [PATCH 3/5] Fix mypy issue --- esmvalcore/_recipe/recipe.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/esmvalcore/_recipe/recipe.py b/esmvalcore/_recipe/recipe.py index 38f48fc663..626c3e05b5 100644 --- a/esmvalcore/_recipe/recipe.py +++ b/esmvalcore/_recipe/recipe.py @@ -520,7 +520,12 @@ def _update_multiproduct( if step == "ensemble_statistics": check.ensemble_statistics_preproc(settings) - grouping = ["project", "dataset", "exp", "sub_experiment"] + grouping: tuple[str, ...] | None = ( + "project", + "dataset", + "exp", + "sub_experiment", + ) else: check.multimodel_statistics_preproc(settings) grouping = settings.get("groupby", None) From 2216c23ba258187fade121099963e1db9f3078bb Mon Sep 17 00:00:00 2001 From: Bouwe Andela Date: Thu, 19 Feb 2026 11:39:54 +0100 Subject: [PATCH 4/5] More informative error message --- esmvalcore/io/intake_esgf.py | 1 - tests/unit/io/test_intake_esgf.py | 25 ++++++++++++++++++++++++- 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/esmvalcore/io/intake_esgf.py b/esmvalcore/io/intake_esgf.py index 1392fadacf..2375958517 100644 --- a/esmvalcore/io/intake_esgf.py +++ b/esmvalcore/io/intake_esgf.py @@ -135,7 +135,6 @@ def prepare(self) -> None: ) raise - self.catalog.to_path_dict(minimal_keys=False) for index in self.catalog.indices: # Set the sessions to None to avoid issues with pickling # requests_cache.CachedSession objects when max_parallel_tasks > 1. 
diff --git a/tests/unit/io/test_intake_esgf.py b/tests/unit/io/test_intake_esgf.py index b24fc698b0..2b2d06acd7 100644 --- a/tests/unit/io/test_intake_esgf.py +++ b/tests/unit/io/test_intake_esgf.py @@ -6,6 +6,7 @@ from typing import TYPE_CHECKING import intake_esgf +import intake_esgf.exceptions import iris.cube import pandas as pd import pytest @@ -35,7 +36,29 @@ def test_prepare(mocker: MockerFixture) -> None: dataset = IntakeESGFDataset(name="id", facets={}, catalog=cat) dataset.prepare() - to_path_mock.assert_called_once_with(minimal_keys=False) + to_path_mock.assert_called_once_with(minimal_keys=False, quiet=True) + + +def test_prepare_fails(mocker: MockerFixture) -> None: + """IntakeESGFDataset.prepare should log catalog.session_log() on failure.""" + cat = intake_esgf.ESGFCatalog() + exc = intake_esgf.exceptions.DatasetLoadError( + ["CMCC.CMCC - CMS.historical.day.atmos.day.r1i1p1.sfcWind"], + None, + ) + to_path_mock = mocker.patch.object( + cat, + "to_path_dict", + autospec=True, + side_effect=exc, + ) + session_log_mock = mocker.patch.object(cat, "session_log", autospec=True) + dataset = IntakeESGFDataset(name="id", facets={}, catalog=cat) + + with pytest.raises(intake_esgf.exceptions.DatasetLoadError): + dataset.prepare() + to_path_mock.assert_called_once_with(minimal_keys=False, quiet=True) + session_log_mock.assert_called_once_with() def test_attributes_raises_before_to_iris() -> None: From 20087af1edee8f2aea59d71e34712eeae92ec758 Mon Sep 17 00:00:00 2001 From: Bouwe Andela Date: Thu, 19 Feb 2026 16:16:48 +0100 Subject: [PATCH 5/5] Add a test --- tests/integration/recipe/test_recipe.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/integration/recipe/test_recipe.py b/tests/integration/recipe/test_recipe.py index 7799ba522f..699e012db6 100644 --- a/tests/integration/recipe/test_recipe.py +++ b/tests/integration/recipe/test_recipe.py @@ -971,6 +971,11 @@ def test_reference_dataset(tmp_path, patched_datafinder, session, monkeypatch): 
) assert product.settings["regrid"]["target_grid"] == reference.datasets[0] + # Check that the target dataset does not have files, to prevent pickling + # errors: https://github.com/ESMValGroup/ESMValCore/issues/2989. + # The files can be found again at load time. + assert product.settings["regrid"]["target_grid"]._files is None + assert product.settings["extract_levels"]["levels"] == levels get_reference_levels.assert_called_once_with(reference.datasets[0])