Skip to content

Commit 21ed7c5

Browse files
committed
Persist Global Spot forecast dedupe state
1 parent bbf4603 commit 21ed7c5

3 files changed

Lines changed: 51 additions & 4 deletions

File tree

docs/research/new-publisher-source-planning/Met_Office_Global_Spot_Forecast_Publisher_Status_2026-05-26.md

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -95,6 +95,7 @@ Oracle live validation completed after host-local key installation:
9595
- A live Global Spot probe against `/sitespecific/v0/point/hourly` succeeded for London Heathrow Area with 49 candidate forecast records and recognized forecast temperature, humidity, wind speed, precipitation probability, and weather-code fields.
9696
- The Global Spot bootstrap resources already existed on OSH: 5 virtual forecast systems, 30 forecast datastreams, and the deployment hierarchy.
9797
- The first live publish attempt inserted 0 observations because OSH rejected an empty `leadTimeHours` decimal field. The publisher now preserves the field shape and uses OSH's supported `NaN` decimal sentinel when the upstream response lacks an issued/model-run time needed to compute lead time.
98+
- Before installing a persistent service, the publisher was updated to persist recently published forecast dedupe keys in `publishers/met_office_global_spot/state.json`, so service restarts do not repost the same forecast horizon.
9899
- After the fix, one live Global Spot `--once` cycle published 625 forecast observations with 0 errors and 0 skipped records.
99100
- CSAPI verification against datastream `06hg2` returned a live forecast observation with forecast type `Met Office Global Spot hourly deterministic forecast`, valid time `2026-05-26T19:00:00Z`, result time `2026-05-26T19:35:34Z`, air temperature `29.9`, and `leadTimeHours=NaN`.
100101
- Production Explorer reloaded to 905 map features after the Global Spot resources and observations were live. Selecting `Met Office Global Spot Portsmouth / Thorney Island Area` rendered a dedicated Forecast section and did not render Latest readings or Recent trend for forecast datastreams.
@@ -123,7 +124,7 @@ MET_OFFICE_DATAHUB_API_KEY_HEADER=apikey
123124
1. Commit and push the publisher `leadTimeHours` sentinel fix and focused parser tests.
124125
2. Commit and push the Explorer UI polish that hides unknown lead time values.
125126
3. Verify Cloudflare Pages production bundle after deployment and re-check the Global Spot Portsmouth / Thorney Island card.
126-
4. Install a persistent Oracle systemd service only after the deployed UI smoke check is clean.
127+
4. Seed the Oracle publisher state from the currently published forecast horizon, then install and start the persistent Oracle systemd service.
127128

128129
## Explorer Follow-Up
129130

publishers/met_office_global_spot/met_office_global_spot_publisher.py

Lines changed: 32 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -29,6 +29,8 @@
2929
DEFAULT_API_BASE = "https://data.hub.api.metoffice.gov.uk/sitespecific/v0"
3030
DEFAULT_HOURLY_PATH = "/point/hourly"
3131
FORECAST_TYPE = "Met Office Global Spot hourly deterministic forecast"
32+
STATE_PATH = Path(__file__).with_name("state.json")
33+
MAX_SEEN_KEYS = 5000
3234

3335

3436
class UpstreamRateLimit(RuntimeError):
@@ -298,6 +300,7 @@ def __init__(self, location_filter: list[str] | None = None):
298300
self.locations = [s for s in self.locations if s["id"] in wanted]
299301
self.parameters = _load_parameters()
300302
self.client = MetOfficeGlobalSpotClient()
303+
self.state = self._load_state()
301304

302305
self.osh_address = os.environ.get("OSH_ADDRESS", "")
303306
self.osh_port = int(os.environ.get("OSH_PORT", "443"))
@@ -318,13 +321,39 @@ def __init__(self, location_filter: list[str] | None = None):
318321
self._auth = "Basic " + base64.b64encode(
319322
f"{self.osh_user}:{self.osh_pass}".encode()).decode()
320323
self._ds_ids: dict[str, dict[str, str]] = {}
321-
self._seen: set[str] = set()
324+
self._seen: set[str] = set(self.state.get("publishedKeys", []))
322325
self._request_delay = float(os.environ.get("MET_OFFICE_GLOBAL_SPOT_REQUEST_DELAY", "1.0"))
323326
self._rate_limit_backoff = float(os.environ.get("MET_OFFICE_GLOBAL_SPOT_429_BACKOFF", "3600"))
324327
self._forecast_hours = float(os.environ.get("MET_OFFICE_GLOBAL_SPOT_FORECAST_HOURS", "24"))
325328
self._cooldown_until = 0.0
326329
self.stats = {"published": 0, "errors": 0, "reconnects": 0, "skipped": 0}
327330

331+
def _load_state(self) -> dict:
    """Load the persisted dedupe state from ``STATE_PATH``.

    Loading is best-effort: a missing, unreadable, or malformed state file
    yields a fresh empty state instead of raising, so the publisher can
    always start.

    Returns:
        A dict that always contains a ``publishedKeys`` list of strings.
        Any other keys present in the file are passed through unchanged.
    """
    try:
        state = json.loads(STATE_PATH.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # OSError covers a missing/unreadable file; ValueError covers both
        # invalid JSON (JSONDecodeError) and undecodable bytes.
        return {"publishedKeys": []}

    # Valid JSON is not necessarily an object (e.g. "[]" or "null"); calling
    # .get() on anything else would raise outside the guarded read above.
    if not isinstance(state, dict):
        return {"publishedKeys": []}

    keys = state.get("publishedKeys")
    if isinstance(keys, list):
        # Callers build a set from these keys, so drop anything that is not a
        # plain string (nested lists/dicts are unhashable).
        state["publishedKeys"] = [key for key in keys if isinstance(key, str)]
    else:
        state["publishedKeys"] = []
    return state
341+
342+
def _save_state(self):
343+
keys = list(dict.fromkeys(self.state.get("publishedKeys", [])))[-MAX_SEEN_KEYS:]
344+
self.state["publishedKeys"] = keys
345+
STATE_PATH.write_text(json.dumps(self.state, indent=2, sort_keys=True) + "\n", encoding="utf-8")
346+
347+
def _remember_seen(self, key: str, *, persist: bool):
    """Record a forecast dedupe key as already handled.

    Args:
        key: Dedupe key of the forecast that was just processed.
        persist: When true, also append the key to ``state["publishedKeys"]``
            and save the state; when false, only the in-memory set is
            updated and nothing is written.
    """
    self._seen.add(key)
    if not persist:
        return

    self.state.setdefault("publishedKeys", []).append(key)
    # _save_state() de-duplicates and caps the list. Trimming here by raw
    # length first could evict a distinct key when `key` is a repeat, so the
    # cap is enforced in that one place only.
    self._save_state()
356+
328357
def _system_uid(self, location_id: str) -> str:
    """Return the system URN for *location_id*, using its ``_uid_token`` form."""
    token = _uid_token(location_id)
    return f"urn:os4csapi:system:met-office-datahub-global-spot:{token}:v1"
330359

@@ -534,13 +563,13 @@ def publish_cycle(self, dry_run: bool = False) -> int:
534563
value_label = f"{parameter['label']}={forecast['value']} {parameter['unit']} valid {forecast['phenomenonTime']}"
535564
if dry_run:
536565
print(f" [{ts_label}] {location_id}/{output_name}: [DRY] {value_label}")
537-
self._seen.add(forecast["dedupeKey"])
566+
self._remember_seen(forecast["dedupeKey"], persist=False)
538567
else:
539568
try:
540569
self._post_observation(ds_id, obs)
541570
self.stats["published"] += 1
542571
published += 1
543-
self._seen.add(forecast["dedupeKey"])
572+
self._remember_seen(forecast["dedupeKey"], persist=True)
544573
print(f" [{ts_label}] {location_id}/{output_name}: OK {value_label}")
545574
except Exception as exc:
546575
self.stats["errors"] += 1

tests/test_met_office_global_spot_parser.py

Lines changed: 17 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
11
from datetime import datetime, timedelta, timezone
22

3+
import publishers.met_office_global_spot.met_office_global_spot_publisher as global_spot
34
from publishers.met_office_global_spot.met_office_global_spot_publisher import (
45
MetOfficeGlobalSpotPublisher,
56
_candidate_records,
@@ -86,3 +87,19 @@ def hourly_forecast(self, _location):
8687

8788
assert forecasts[0]["result"]["issuedTime"] == ""
8889
assert forecasts[0]["result"]["leadTimeHours"] == "NaN"
90+
91+
92+
def test_seen_forecast_keys_persist_for_service_restarts(tmp_path, monkeypatch):
    """A key remembered with ``persist=True`` is visible to a fresh publisher."""
    dedupe_key = "location|air_temperature_forecast|valid|12.3"
    # Point the module at a throwaway state file for the duration of the test.
    monkeypatch.setattr(global_spot, "STATE_PATH", tmp_path / "state.json")

    # __new__ skips __init__, so only the attributes under test are set up.
    first_run = MetOfficeGlobalSpotPublisher.__new__(MetOfficeGlobalSpotPublisher)
    first_run._seen = set()
    first_run.state = {"publishedKeys": []}
    first_run._remember_seen(dedupe_key, persist=True)

    # A second bare instance stands in for the process after a restart.
    second_run = MetOfficeGlobalSpotPublisher.__new__(MetOfficeGlobalSpotPublisher)
    second_run.state = second_run._load_state()
    second_run._seen = set(second_run.state.get("publishedKeys", []))

    assert dedupe_key in second_run._seen

0 commit comments

Comments
 (0)