From 628d2e6b04a587fb76fcf1c169e12b8c777570fc Mon Sep 17 00:00:00 2001 From: stacknil Date: Sat, 23 May 2026 17:56:33 +0800 Subject: [PATCH] fix(weather): harden diagnostic edge cases --- .../python-weather-diagnostics-toolkit-ci.yml | 3 +++ .../docs/calculation-methods.md | 13 ++++++---- .../docs/diagnostic-analysis.md | 5 ++++ .../dynamics.py | 15 ++++++++---- .../ensemble.py | 19 ++++++++++----- .../features.py | 16 +++++++++++-- .../tests/test_dynamics.py | 12 ++++++++++ .../tests/test_ensemble.py | 21 ++++++++++++++++ .../tests/test_features.py | 24 +++++++++++++++++++ 9 files changed, 112 insertions(+), 16 deletions(-) create mode 100644 projects/python-weather-diagnostics-toolkit/tests/test_ensemble.py diff --git a/.github/workflows/python-weather-diagnostics-toolkit-ci.yml b/.github/workflows/python-weather-diagnostics-toolkit-ci.yml index cd11c83..b8f204f 100644 --- a/.github/workflows/python-weather-diagnostics-toolkit-ci.yml +++ b/.github/workflows/python-weather-diagnostics-toolkit-ci.yml @@ -43,6 +43,9 @@ jobs: - name: Run tests run: python -m pytest + - name: Run lint + run: python -m ruff check . + - name: Compile modules and scripts run: python -m compileall src scripts diff --git a/projects/python-weather-diagnostics-toolkit/docs/calculation-methods.md b/projects/python-weather-diagnostics-toolkit/docs/calculation-methods.md index 513bb7b..9443c5f 100644 --- a/projects/python-weather-diagnostics-toolkit/docs/calculation-methods.md +++ b/projects/python-weather-diagnostics-toolkit/docs/calculation-methods.md @@ -146,7 +146,9 @@ area_mean = sum(field * weight) / sum(valid_weight) This avoids treating all latitude rows as equal-area rows. The implementation supports arrays with time or pressure dimensions before the final -latitude/longitude dimensions. +latitude/longitude dimensions. If a reduction window contains no finite values, +the result is `NaN`; this keeps missing-data coverage explicit instead of +converting an empty region into a numerical zero. ## Time-Ordered Baseline Model @@ -203,15 +205,18 @@ fixed seed. It then computes: ```text mean = ensemble mean -spread = ensemble standard deviation +spread = ensemble population standard deviation p10 = 10th percentile p90 = 90th percentile warm_probability = fraction(member >= 0.5) cold_probability = fraction(member <= -0.5) ``` -These calculations demonstrate ensemble-summary mechanics without embedding -real forecast products. +The summary requires finite member values. The spread uses the population +standard deviation (`ddof=0`), so a one-member reviewer fixture has spread `0` +instead of an undefined sample standard deviation. These calculations +demonstrate ensemble-summary mechanics without embedding real forecast +products. Example synthetic summary rows: diff --git a/projects/python-weather-diagnostics-toolkit/docs/diagnostic-analysis.md b/projects/python-weather-diagnostics-toolkit/docs/diagnostic-analysis.md index 291dcaa..9d1f0f0 100644 --- a/projects/python-weather-diagnostics-toolkit/docs/diagnostic-analysis.md +++ b/projects/python-weather-diagnostics-toolkit/docs/diagnostic-analysis.md @@ -142,6 +142,11 @@ members: - quantiles show an uncertainty envelope - threshold probabilities translate members into categorical risk-like summaries +The public implementation treats missing or infinite ensemble values as input +quality problems. Synthetic fixtures are deliberately complete, which makes +review outputs deterministic and keeps missing-data policy separate from +forecast interpretation. + Interpretation pattern: ```text diff --git a/projects/python-weather-diagnostics-toolkit/src/python_weather_diagnostics_toolkit/dynamics.py b/projects/python-weather-diagnostics-toolkit/src/python_weather_diagnostics_toolkit/dynamics.py index 23fdd11..fbd95cc 100644 --- a/projects/python-weather-diagnostics-toolkit/src/python_weather_diagnostics_toolkit/dynamics.py +++ b/projects/python-weather-diagnostics-toolkit/src/python_weather_diagnostics_toolkit/dynamics.py @@ -60,11 +60,18 @@ def relative_vorticity(u_wind, v_wind, latitude, longitude) -> np.ndarray: return dv_dx - du_dy +def _matching_field(name: str, field, expected_shape: tuple[int, int]) -> np.ndarray: + values = np.asarray(field, dtype=float) + if values.shape != expected_shape: + raise ValueError(f"{name} shape must match scalar field shape: {values.shape} != {expected_shape}") + return values + + def horizontal_advection(scalar, u_wind, v_wind, latitude, longitude) -> np.ndarray: """Compute horizontal advection, -(u dS/dx + v dS/dy).""" + scalar_values = np.asarray(scalar, dtype=float) dscalar_dy, dscalar_dx = gradient_on_latlon(scalar, latitude, longitude) - return -( - np.asarray(u_wind, dtype=float) * dscalar_dx - + np.asarray(v_wind, dtype=float) * dscalar_dy - ) + u_values = _matching_field("u_wind", u_wind, scalar_values.shape) + v_values = _matching_field("v_wind", v_wind, scalar_values.shape) + return -(u_values * dscalar_dx + v_values * dscalar_dy) diff --git a/projects/python-weather-diagnostics-toolkit/src/python_weather_diagnostics_toolkit/ensemble.py b/projects/python-weather-diagnostics-toolkit/src/python_weather_diagnostics_toolkit/ensemble.py index 3f3004f..cee210b 100644 --- a/projects/python-weather-diagnostics-toolkit/src/python_weather_diagnostics_toolkit/ensemble.py +++ b/projects/python-weather-diagnostics-toolkit/src/python_weather_diagnostics_toolkit/ensemble.py @@ -34,11 +34,18 @@ def make_synthetic_nino_ensemble( def ensemble_summary(df: pd.DataFrame) -> pd.DataFrame: """Summarize an ensemble by lead month.""" + if df.empty or df.shape[1] == 0: + raise ValueError("ensemble summary requires at least one member and one lead") + + values = df.astype(float) + if not np.isfinite(values.to_numpy()).all(): + raise ValueError("ensemble summary requires finite member values") + summary = pd.DataFrame(index=df.index) - summary["mean"] = df.mean(axis=1) - summary["spread"] = df.std(axis=1) - summary["p10"] = df.quantile(0.10, axis=1) - summary["p90"] = df.quantile(0.90, axis=1) - summary["warm_probability"] = (df >= 0.5).mean(axis=1) - summary["cold_probability"] = (df <= -0.5).mean(axis=1) + summary["mean"] = values.mean(axis=1) + summary["spread"] = values.std(axis=1, ddof=0) + summary["p10"] = values.quantile(0.10, axis=1) + summary["p90"] = values.quantile(0.90, axis=1) + summary["warm_probability"] = (values >= 0.5).mean(axis=1) + summary["cold_probability"] = (values <= -0.5).mean(axis=1) return summary diff --git a/projects/python-weather-diagnostics-toolkit/src/python_weather_diagnostics_toolkit/features.py b/projects/python-weather-diagnostics-toolkit/src/python_weather_diagnostics_toolkit/features.py index b4d3f19..0d329ec 100644 --- a/projects/python-weather-diagnostics-toolkit/src/python_weather_diagnostics_toolkit/features.py +++ b/projects/python-weather-diagnostics-toolkit/src/python_weather_diagnostics_toolkit/features.py @@ -17,6 +17,8 @@ def area_mean(field, latitude) -> np.ndarray: """Area-weight a field over its final two latitude/longitude dimensions.""" values = np.asarray(field, dtype=float) + if values.ndim < 2: + raise ValueError("field must have at least latitude and longitude dimensions") weights = latitude_weights(latitude) if values.shape[-2] != weights.size: raise ValueError("latitude length must match the second-to-last field dimension") @@ -24,7 +26,9 @@ def area_mean(field, latitude) -> np.ndarray: weights_2d = weights.reshape(reshape) numerator = np.nansum(values * weights_2d, axis=(-2, -1)) denominator = np.nansum(np.isfinite(values) * weights_2d, axis=(-2, -1)) - return numerator / denominator + with np.errstate(divide="ignore", invalid="ignore"): + mean = numerator / denominator + return np.where(denominator > 0.0, mean, np.nan) def build_forecast_frame( @@ -66,8 +70,12 @@ def ridge_regression_fit_predict( raise ValueError("features must be a two-dimensional array") if y.ndim != 1 or y.size != x.shape[0]: raise ValueError("target must be one-dimensional and aligned with features") + if not np.isfinite(x).all() or not np.isfinite(y).all(): + raise ValueError("features and target must contain only finite values") if not 0.0 < train_fraction < 1.0: raise ValueError("train_fraction must be between 0 and 1") + if alpha < 0.0: + raise ValueError("alpha must be non-negative") split = int(x.shape[0] * train_fraction) if split <= 0 or split >= x.shape[0]: @@ -78,7 +86,7 @@ def ridge_regression_fit_predict( mean = x_train.mean(axis=0) scale = x_train.std(axis=0) - scale = np.where(scale == 0.0, 1.0, scale) + scale = np.where(np.isclose(scale, 0.0), 1.0, scale) x_train_scaled = (x_train - mean) / scale x_test_scaled = (x_test - mean) / scale @@ -106,6 +114,10 @@ def regression_metrics(y_true, y_pred) -> dict[str, float]: pred = np.asarray(y_pred, dtype=float) if true.shape != pred.shape: raise ValueError("y_true and y_pred must have matching shapes") + if true.size == 0: + raise ValueError("metrics require at least one sample") + if not np.isfinite(true).all() or not np.isfinite(pred).all(): + raise ValueError("metrics require finite values") err = pred - true if true.size < 2 or np.allclose(true.std(), 0.0) or np.allclose(pred.std(), 0.0): diff --git a/projects/python-weather-diagnostics-toolkit/tests/test_dynamics.py b/projects/python-weather-diagnostics-toolkit/tests/test_dynamics.py index 53c4b38..f563a8f 100644 --- a/projects/python-weather-diagnostics-toolkit/tests/test_dynamics.py +++ b/projects/python-weather-diagnostics-toolkit/tests/test_dynamics.py @@ -1,4 +1,5 @@ import numpy as np +import pytest from python_weather_diagnostics_toolkit.dynamics import ( gradient_on_latlon, @@ -40,3 +41,14 @@ def test_zonal_gradient_masks_exact_pole_rows_without_infinity(): assert np.isnan(d_dx[0]).all() assert np.isnan(d_dx[-1]).all() assert np.isfinite(d_dx[1:-1]).all() + + +def test_horizontal_advection_rejects_mismatched_wind_shape(): + lat = np.linspace(30.0, 35.0, 5) + lon = np.linspace(100.0, 105.0, 6) + scalar = np.ones((lat.size, lon.size)) + u = np.ones((lat.size, lon.size - 1)) + v = np.ones_like(scalar) + + with pytest.raises(ValueError, match="u_wind shape"): + horizontal_advection(scalar, u, v, lat, lon) diff --git a/projects/python-weather-diagnostics-toolkit/tests/test_ensemble.py b/projects/python-weather-diagnostics-toolkit/tests/test_ensemble.py new file mode 100644 index 0000000..6bf2a12 --- /dev/null +++ b/projects/python-weather-diagnostics-toolkit/tests/test_ensemble.py @@ -0,0 +1,21 @@ +import numpy as np +import pandas as pd +import pytest + +from python_weather_diagnostics_toolkit.ensemble import ensemble_summary + + +def test_single_member_ensemble_has_zero_spread(): + df = pd.DataFrame({"member_01": [0.2, 0.8]}, index=[1, 2]) + + summary = ensemble_summary(df) + + np.testing.assert_allclose(summary["spread"].to_numpy(), np.array([0.0, 0.0])) + np.testing.assert_allclose(summary["warm_probability"].to_numpy(), np.array([0.0, 1.0])) + + +def test_ensemble_summary_rejects_non_finite_values(): + df = pd.DataFrame({"member_01": [0.2, np.nan]}, index=[1, 2]) + + with pytest.raises(ValueError, match="finite"): + ensemble_summary(df) diff --git a/projects/python-weather-diagnostics-toolkit/tests/test_features.py b/projects/python-weather-diagnostics-toolkit/tests/test_features.py index 1238554..8491f1c 100644 --- a/projects/python-weather-diagnostics-toolkit/tests/test_features.py +++ b/projects/python-weather-diagnostics-toolkit/tests/test_features.py @@ -1,4 +1,5 @@ import numpy as np +import pytest from python_weather_diagnostics_toolkit.features import ( area_mean, @@ -16,6 +17,15 @@ def test_area_mean_preserves_constant_field(): np.testing.assert_allclose(mean, np.array([7.0, 7.0])) +def test_area_mean_returns_nan_for_all_missing_region(): + lat = np.array([30.0, 35.0, 40.0]) + field = np.full((lat.size, 4), np.nan) + + mean = area_mean(field, lat) + + assert np.isnan(mean) + + def test_ridge_baseline_returns_predictions_and_metrics(): x = np.column_stack([np.arange(20.0), np.arange(20.0) * 0.5]) y = 2.0 + x[:, 0] * 0.3 - x[:, 1] * 0.1 @@ -25,3 +35,17 @@ def test_ridge_baseline_returns_predictions_and_metrics(): assert result["y_pred"].shape == result["y_test"].shape assert metrics["rmse"] < 0.05 + + +def test_ridge_baseline_rejects_non_finite_training_values(): + x = np.column_stack([np.arange(20.0), np.arange(20.0) * 0.5]) + y = np.arange(20.0) + x[3, 0] = np.inf + + with pytest.raises(ValueError, match="finite"): + ridge_regression_fit_predict(x, y) + + +def test_regression_metrics_reject_empty_inputs(): + with pytest.raises(ValueError, match="at least one sample"): + regression_metrics(np.array([]), np.array([]))