From 23af5c84f21c2254a2553ec8729568dfe544e953 Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Sun, 17 May 2026 16:53:00 -0500 Subject: [PATCH 01/24] feat(waterdata): drop hash-valued ID columns by default MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The waterdata OGC services previously returned per-record UUID columns (``daily_id``, ``continuous_id``, ``peak_id``, …) plus secondary hash columns (``time_series_id``, ``parent_time_series_id``, ``field_visit_id``, ``field_measurements_series_id``) in every response. These IDs are unstable across record refreshes and not human-meaningful — stable identifiers like ``monitoring_location_id`` (AGENCY-ID format), ``parameter_code``, ``statistic_id`` and ``time`` are sufficient to pin a row. Drop the hash columns by default and add ``include_hash_ids: bool = False`` to every OGC ``get_*`` function for opt-in. Implementation trims server-side via the OGC ``properties=`` query parameter (cached per service from one queryables fetch) so the payload itself is smaller, with a client-side drop as a safety net. ``monitoring_location_id`` and other AGENCY-ID / code-style identifiers are unaffected. Offline benchmark on a synthetic 30,000-row payload (mirrors the on-wire shape and per-row size of a 1-year ``get_continuous`` query): - Server payload: 14,310,081 → 10,230,081 bytes (28.5% smaller) - DataFrame memory: 14.2 MB → 9.4 MB (33.5% smaller) - Peak traced memory: 94.1 MB → 73.9 MB (21.5% smaller) - Local parse + DataFrame construction: 1.05s → 0.94s (10.8% faster) Network savings stack on top of the local speedup. For very small queries (≲1k rows) the one-time queryables fetch overhead can dominate the savings; large queries are the target. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- benchmarks/_fixtures/.gitignore | 4 + benchmarks/_fixtures/synthesize.py | 123 +++++++++ benchmarks/bench_include_hash_ids.py | 356 +++++++++++++++++++++++++++ benchmarks/results_offline.txt | 22 ++ dataretrieval/waterdata/api.py | 154 ++++++++++++ dataretrieval/waterdata/utils.py | 170 ++++++++++++- tests/waterdata_test.py | 71 +++++- tests/waterdata_utils_test.py | 75 ++++++ 8 files changed, 952 insertions(+), 23 deletions(-) create mode 100644 benchmarks/_fixtures/.gitignore create mode 100644 benchmarks/_fixtures/synthesize.py create mode 100644 benchmarks/bench_include_hash_ids.py create mode 100644 benchmarks/results_offline.txt diff --git a/benchmarks/_fixtures/.gitignore b/benchmarks/_fixtures/.gitignore new file mode 100644 index 00000000..5efdd79b --- /dev/null +++ b/benchmarks/_fixtures/.gitignore @@ -0,0 +1,4 @@ +# Synthesized or captured response payloads are large (≈10–15 MB each) +# and trivially regenerated. Keep them out of the repo; the script that +# produces them is committed. +*.json diff --git a/benchmarks/_fixtures/synthesize.py b/benchmarks/_fixtures/synthesize.py new file mode 100644 index 00000000..72e07ffb --- /dev/null +++ b/benchmarks/_fixtures/synthesize.py @@ -0,0 +1,123 @@ +"""Generate synthetic OGC API payloads for the offline benchmark. + +We can't always reach the live USGS API (rate limits, no token), but +the local cost of an ``include_hash_ids=False`` vs ``True`` call is +dominated by: + - JSON parsing (``response.json()``) + - ``pandas.json_normalize`` over the features list + - DataFrame column allocation + +All three scale with payload bytes and feature count. A synthetic +payload that mirrors the real wire format and the real per-row column +shape is sufficient to measure them. 
+""" + +from __future__ import annotations + +import json +import uuid +from datetime import datetime, timedelta, timezone +from pathlib import Path + +ROWS = 30000 # ~1 year of 15-minute continuous data +HERE = Path(__file__).parent + + +def _row(i: int) -> dict: + ts = datetime(2024, 1, 1, tzinfo=timezone.utc) + timedelta(minutes=15 * i) + return { + "id": str(uuid.uuid4()), + "time_series_id": uuid.uuid4().hex, + "monitoring_location_id": "USGS-02238500", + "parameter_code": "00060", + "statistic_id": "00011", + "time": ts.strftime("%Y-%m-%dT%H:%M:%S+00:00"), + "value": f"{100.0 + 0.01 * (i % 1000):.2f}", + "unit_of_measure": "ft^3/s", + "approval_status": "Approved", + "qualifier": None, + "last_modified": "2026-05-01T00:00:00+00:00", + } + + +def _feature(props: dict) -> dict: + return { + "type": "Feature", + "properties": props, + "id": props.get("id", ""), + "geometry": None, + } + + +HASH_COLS = {"id", "time_series_id"} + + +def build_full() -> dict: + features = [_feature(_row(i)) for i in range(ROWS)] + return { + "type": "FeatureCollection", + "features": features, + "numberReturned": ROWS, + "links": [], + } + + +def build_trimmed() -> dict: + features = [] + for i in range(ROWS): + props = {k: v for k, v in _row(i).items() if k not in HASH_COLS} + features.append( + { + "type": "Feature", + "properties": props, + "id": "", + "geometry": None, + } + ) + return { + "type": "FeatureCollection", + "features": features, + "numberReturned": ROWS, + "links": [], + } + + +def build_queryables() -> dict: + return { + "properties": { + "geometry": {}, + "id": {}, + "time_series_id": {}, + "monitoring_location_id": {}, + "parameter_code": {}, + "statistic_id": {}, + "time": {}, + "value": {}, + "unit_of_measure": {}, + "approval_status": {}, + "qualifier": {}, + "last_modified": {}, + } + } + + +def main() -> None: + HERE.mkdir(exist_ok=True) + full = build_full() + trimmed = build_trimmed() + queryables = build_queryables() + + (HERE / 
"continuous_full.json").write_text(json.dumps(full)) + (HERE / "continuous_trimmed.json").write_text(json.dumps(trimmed)) + (HERE / "continuous_queryables.json").write_text(json.dumps(queryables)) + + full_size = (HERE / "continuous_full.json").stat().st_size + trim_size = (HERE / "continuous_trimmed.json").stat().st_size + pct = 100 * (full_size - trim_size) / full_size + print(f"rows: {ROWS:,}") + print(f"full: {full_size:>12,} bytes") + print(f"trimmed: {trim_size:>12,} bytes ({pct:.1f}% smaller)") + + +if __name__ == "__main__": + main() diff --git a/benchmarks/bench_include_hash_ids.py b/benchmarks/bench_include_hash_ids.py new file mode 100644 index 00000000..9dff5c7b --- /dev/null +++ b/benchmarks/bench_include_hash_ids.py @@ -0,0 +1,356 @@ +"""Benchmark: default (``include_hash_ids=False``) vs legacy +(``include_hash_ids=True``) on a large ``get_continuous`` query. + +The two settings are functionally equivalent except for the presence of +UUID/hex-hash ID columns in the response. The hash columns are: + - ``continuous_id`` — 36-char UUID, one per record + - ``time_series_id`` — 32-char hex hash, one per record + +For a 50,000-row response that's 68 bytes/row of hash, plus JSON +overhead — roughly 4 MB of payload that we now neither transfer nor +parse. The expected wins: + - smaller HTTP payload (server-side ``properties=`` trim) + - fewer columns to ``json_normalize`` in pandas + - smaller DataFrame footprint + +Run with:: + + API_USGS_PAT=YOUR_TOKEN python benchmarks/bench_include_hash_ids.py + +Without a token, USGS rate-limits to ~120 requests/hour which is enough +for a single comparison run but not for retrying. The script clears the +queryables cache before every run so each configuration pays the same +schema-fetch cost. 
+""" + +from __future__ import annotations + +import argparse +import gc +import json +import sys +import time +import tracemalloc +from dataclasses import dataclass +from pathlib import Path +from unittest import mock + +import requests + +from dataretrieval.waterdata import get_continuous +from dataretrieval.waterdata import utils as wd_utils + +# A long-running, high-frequency gage. Continuous (sub-hourly) records +# yield O(10⁴) rows per year per parameter — the time window below is +# tuned so a single query returns ~one page worth of data, large enough +# that JSON parsing/DataFrame construction dominates over network +# round-trip variability. +SITE = "USGS-02238500" +PARAMETER = "00060" +# ~1 year of 15-min flow data ≈ 35,000 rows, just under one page. +TIME_RANGE = "2023-01-01/2024-01-01" + +# Where to stash the captured payload for the offline benchmark mode. +FIXTURE_DIR = Path(__file__).parent / "_fixtures" +FIXTURE_FULL = FIXTURE_DIR / "continuous_full.json" +FIXTURE_TRIMMED = FIXTURE_DIR / "continuous_trimmed.json" +FIXTURE_QUERYABLES = FIXTURE_DIR / "continuous_queryables.json" + + +@dataclass +class RunResult: + label: str + wall_seconds: float + rows: int + cols: int + mem_peak_bytes: int + memory_usage_bytes: int + + def __str__(self) -> str: + return ( + f" {self.label:>32}: " + f"{self.wall_seconds:6.2f}s " + f"rows={self.rows:>7} " + f"cols={self.cols:>2} " + f"peak_mem={self.mem_peak_bytes / 1024 / 1024:6.1f} MB " + f"df_mem={self.memory_usage_bytes / 1024 / 1024:6.1f} MB" + ) + + +def time_call(label: str, **kwargs) -> RunResult: + """One end-to-end ``get_continuous`` call, with wall time, peak + traced memory and final DataFrame memory captured.""" + # Reset the queryables cache so each configuration pays the same + # one-time schema-fetch cost (when it applies). 
+ wd_utils._queryables_cache.clear() + gc.collect() + + tracemalloc.start() + start = time.perf_counter() + df, _md = get_continuous( + monitoring_location_id=SITE, + parameter_code=PARAMETER, + time=TIME_RANGE, + **kwargs, + ) + wall = time.perf_counter() - start + _current, peak = tracemalloc.get_traced_memory() + tracemalloc.stop() + + return RunResult( + label=label, + wall_seconds=wall, + rows=len(df), + cols=df.shape[1], + mem_peak_bytes=peak, + memory_usage_bytes=int(df.memory_usage(deep=True).sum()), + ) + + +def _make_mock_response(body_bytes: bytes, status: int = 200) -> mock.Mock: + """Build a ``requests.Response``-shaped mock backed by ``body_bytes``. + + Only the attributes ``_walk_pages`` and ``_get_resp_data`` touch + need to be real; ``elapsed``, ``status_code``, ``headers``, ``json()``, + and ``raise_for_status()`` cover those callers. Pagination is + forced single-page by stripping any "next" link from the body + before the test patches it in. + """ + resp = mock.Mock(spec=requests.Response) + resp.status_code = status + resp.headers = {"x-ratelimit-remaining": "1000"} + resp.elapsed = __import__("datetime").timedelta(milliseconds=1) + resp.url = "https://test/mock" + resp.text = body_bytes.decode("utf-8") + resp.json = lambda: json.loads(body_bytes) + resp.raise_for_status = lambda: None + return resp + + +def capture_fixtures() -> None: + """Snapshot the two response payloads (with and without hash IDs) + plus the queryables response, so the ``--offline`` benchmark can + parse them locally with no network calls.""" + FIXTURE_DIR.mkdir(exist_ok=True) + session = requests.Session() + headers = wd_utils._default_headers() + + # Queryables (used only by the default code path). 
+ q_url = f"{wd_utils.OGC_API_URL}/collections/continuous/queryables" + print(f"Fetching {q_url}") + r = session.get(q_url, headers=headers) + r.raise_for_status() + FIXTURE_QUERYABLES.write_bytes(r.content) + + body = json.loads(r.content) + all_props = list(body.get("properties", {}).keys()) + non_hash = [ + p + for p in all_props + if p not in wd_utils._HASH_ID_COLUMNS and p != "geometry" and p != "id" + ] + + base = f"{wd_utils.OGC_API_URL}/collections/continuous/items" + common = { + "monitoring_location_id": SITE, + "parameter_code": PARAMETER, + "time": TIME_RANGE, + "skipGeometry": True, + "limit": 50000, + } + + # Full payload (legacy behavior — every column). + print("Fetching full payload …") + r = session.get(base, headers=headers, params=common) + r.raise_for_status() + FIXTURE_FULL.write_bytes(r.content) + print(f" → {FIXTURE_FULL.name}: {len(r.content):,} bytes") + + # Trimmed payload (new default — non-hash columns only). + print("Fetching trimmed payload …") + params = dict(common, properties=",".join(non_hash)) + r = session.get(base, headers=headers, params=params) + r.raise_for_status() + FIXTURE_TRIMMED.write_bytes(r.content) + print(f" → {FIXTURE_TRIMMED.name}: {len(r.content):,} bytes") + + full_size = FIXTURE_FULL.stat().st_size + trim_size = FIXTURE_TRIMMED.stat().st_size + pct = 100 * (full_size - trim_size) / full_size + print() + print( + f"Server payload size: {full_size:,} → {trim_size:,} bytes ({pct:.1f}% smaller)" + ) + + +def time_offline(label: str, payload_path: Path, include_hash_ids: bool) -> RunResult: + """Measure parsing + DataFrame construction time on a captured + payload. 
``requests.Session.send`` and ``requests.get`` are patched to return the recorded + responses, so this isolates the local-CPU portion of a call from + network variability and rate-limit pressure.""" + body = payload_path.read_bytes() + queryables_body = FIXTURE_QUERYABLES.read_bytes() + wd_utils._queryables_cache.clear() + gc.collect() + + def _send(req, *args, **kwargs): + return _make_mock_response(body) + + def _get(url, *args, **kwargs): + return _make_mock_response(queryables_body) + + with ( + mock.patch.object(requests.Session, "send", _send), + mock.patch.object(requests, "get", _get), + ): + tracemalloc.start() + start = time.perf_counter() + df, _md = get_continuous( + monitoring_location_id=SITE, + parameter_code=PARAMETER, + time=TIME_RANGE, + include_hash_ids=include_hash_ids, + ) + wall = time.perf_counter() - start + _current, peak = tracemalloc.get_traced_memory() + tracemalloc.stop() + + return RunResult( + label=label, + wall_seconds=wall, + rows=len(df), + cols=df.shape[1], + mem_peak_bytes=peak, + memory_usage_bytes=int(df.memory_usage(deep=True).sum()), + ) + + +def main() -> int: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + "--capture", + action="store_true", + help="Fetch and save the two response payloads as fixtures, then exit.", + ) + parser.add_argument( + "--offline", + action="store_true", + help="Use captured fixtures instead of hitting the live API. " + "Isolates parsing/DataFrame cost from network variability.", + ) + parser.add_argument( + "--rounds", + type=int, + default=3, + help="Number of measured rounds per configuration (default: 3).", + ) + args = parser.parse_args() + + if args.capture: + capture_fixtures() + return 0 + + if args.offline: + if not ( + FIXTURE_FULL.exists() + and FIXTURE_TRIMMED.exists() + and FIXTURE_QUERYABLES.exists() + ): + print( + "Missing fixtures. 
Run with --capture first to record them.", + file=sys.stderr, + ) + return 1 + + full_size = FIXTURE_FULL.stat().st_size + trim_size = FIXTURE_TRIMMED.stat().st_size + print( + f"Offline benchmark on fixtures " + f"(full: {full_size:,} B, trimmed: {trim_size:,} B, " + f"server-side savings: {100 * (full_size - trim_size) / full_size:.1f}%)" + ) + print() + + # Warm up to load pandas/geopandas/numpy code paths. + time_offline("warmup_default", FIXTURE_TRIMMED, include_hash_ids=False) + time_offline("warmup_legacy", FIXTURE_FULL, include_hash_ids=True) + + runs = [] + for _ in range(args.rounds): + runs.append( + time_offline( + "default (hash IDs dropped)", + FIXTURE_TRIMMED, + include_hash_ids=False, + ) + ) + runs.append( + time_offline( + "include_hash_ids=True", + FIXTURE_FULL, + include_hash_ids=True, + ) + ) + else: + print( + f"Benchmarking get_continuous(site={SITE!r}, parameter={PARAMETER!r}, " + f"time={TIME_RANGE!r})" + ) + print() + + # Warmup once with each configuration to absorb DNS/TLS/cache + # cold-start effects, then run measured rounds. 
+ print("Warming up …") + time_call("warmup_default") + time_call("warmup_legacy", include_hash_ids=True) + + runs = [] + for _ in range(args.rounds): + runs.append(time_call("default (hash IDs dropped)")) + runs.append(time_call("include_hash_ids=True", include_hash_ids=True)) + + print("All runs:") + for r in runs: + print(r) + print() + + best_default = min( + (r for r in runs if r.label.startswith("default")), + key=lambda r: r.wall_seconds, + ) + best_legacy = min( + (r for r in runs if r.label.startswith("include_hash_ids")), + key=lambda r: r.wall_seconds, + ) + + print("Best of each:") + print(best_default) + print(best_legacy) + print() + + wall_delta = best_legacy.wall_seconds - best_default.wall_seconds + wall_pct = ( + 100 * wall_delta / best_legacy.wall_seconds if best_legacy.wall_seconds else 0.0 + ) + mem_delta = best_legacy.memory_usage_bytes - best_default.memory_usage_bytes + mem_pct = ( + 100 * mem_delta / best_legacy.memory_usage_bytes + if best_legacy.memory_usage_bytes + else 0.0 + ) + peak_delta = best_legacy.mem_peak_bytes - best_default.mem_peak_bytes + peak_pct = ( + 100 * peak_delta / best_legacy.mem_peak_bytes + if best_legacy.mem_peak_bytes + else 0.0 + ) + + print(f"Wall-clock speedup: {wall_delta:+.2f}s ({wall_pct:+.1f}%)") + print(f"DataFrame memory: {mem_delta / 1024 / 1024:+.1f} MB ({mem_pct:+.1f}%)") + print(f"Peak traced memory: {peak_delta / 1024 / 1024:+.1f} MB ({peak_pct:+.1f}%)") + print(f"Columns dropped: {best_legacy.cols - best_default.cols}") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/benchmarks/results_offline.txt b/benchmarks/results_offline.txt new file mode 100644 index 00000000..e7e10919 --- /dev/null +++ b/benchmarks/results_offline.txt @@ -0,0 +1,22 @@ +Offline benchmark on fixtures (full: 14,310,081 B, trimmed: 10,230,081 B, server-side savings: 28.5%) + +All runs: + default (hash IDs dropped): 0.96s rows= 30000 cols= 9 peak_mem= 73.9 MB df_mem= 9.4 MB + include_hash_ids=True: 1.09s 
rows= 30000 cols=11 peak_mem= 94.1 MB df_mem= 14.2 MB + default (hash IDs dropped): 0.94s rows= 30000 cols= 9 peak_mem= 73.9 MB df_mem= 9.4 MB + include_hash_ids=True: 1.08s rows= 30000 cols=11 peak_mem= 94.1 MB df_mem= 14.2 MB + default (hash IDs dropped): 0.94s rows= 30000 cols= 9 peak_mem= 73.9 MB df_mem= 9.4 MB + include_hash_ids=True: 1.06s rows= 30000 cols=11 peak_mem= 94.1 MB df_mem= 14.2 MB + default (hash IDs dropped): 0.97s rows= 30000 cols= 9 peak_mem= 73.9 MB df_mem= 9.4 MB + include_hash_ids=True: 1.09s rows= 30000 cols=11 peak_mem= 94.1 MB df_mem= 14.2 MB + default (hash IDs dropped): 0.97s rows= 30000 cols= 9 peak_mem= 73.9 MB df_mem= 9.4 MB + include_hash_ids=True: 1.05s rows= 30000 cols=11 peak_mem= 94.1 MB df_mem= 14.2 MB + +Best of each: + default (hash IDs dropped): 0.94s rows= 30000 cols= 9 peak_mem= 73.9 MB df_mem= 9.4 MB + include_hash_ids=True: 1.05s rows= 30000 cols=11 peak_mem= 94.1 MB df_mem= 14.2 MB + +Wall-clock speedup: +0.11s (+10.8%) +DataFrame memory: +4.7 MB (+33.5%) +Peak traced memory: +20.2 MB (+21.5%) +Columns dropped: 2 diff --git a/dataretrieval/waterdata/api.py b/dataretrieval/waterdata/api.py index 1ec9ed42..f586d229 100644 --- a/dataretrieval/waterdata/api.py +++ b/dataretrieval/waterdata/api.py @@ -61,6 +61,7 @@ def get_daily( filter: str | None = None, filter_lang: FILTER_LANG | None = None, convert_type: bool = True, + include_hash_ids: bool = False, ) -> tuple[pd.DataFrame, BaseMetadata]: """Daily data provide one data value to represent water conditions for the day. @@ -193,6 +194,19 @@ def get_daily( and the lexicographic-comparison pitfall. convert_type : boolean, optional If True, converts columns to appropriate types. 
+ include_hash_ids : boolean, optional + If False (default), hash-valued ID columns (the per-record UUID + used as the row's primary key, plus secondary hash columns such + as ``time_series_id``, ``parent_time_series_id``, + ``field_visit_id``, and ``field_measurements_series_id``) are + omitted from the response. These IDs are not stable across + record refreshes and are not human-meaningful; dropping them + also shrinks the server payload for large queries. Stable, + human-meaningful identifiers like ``monitoring_location_id``, + ``parameter_code``, and ``statistic_id`` are always returned. + Set to True to restore the pre-existing behavior of including + every column. Listing a hash column explicitly in + ``properties`` also overrides this default for that column. Returns ------- @@ -276,6 +290,7 @@ def get_continuous( filter: str | None = None, filter_lang: FILTER_LANG | None = None, convert_type: bool = True, + include_hash_ids: bool = False, ) -> tuple[pd.DataFrame, BaseMetadata]: """ Continuous data provide instantaneous water conditions. @@ -403,6 +418,19 @@ def get_continuous( convert_type : boolean, optional If True, the function will convert the data to dates and qualifier to string vector + include_hash_ids : boolean, optional + If False (default), hash-valued ID columns (the per-record UUID + used as the row's primary key, plus secondary hash columns such + as ``time_series_id``, ``parent_time_series_id``, + ``field_visit_id``, and ``field_measurements_series_id``) are + omitted from the response. These IDs are not stable across + record refreshes and are not human-meaningful; dropping them + also shrinks the server payload for large queries. Stable, + human-meaningful identifiers like ``monitoring_location_id``, + ``parameter_code``, and ``statistic_id`` are always returned. + Set to True to restore the pre-existing behavior of including + every column. Listing a hash column explicitly in + ``properties`` also overrides this default for that column. 
Returns ------- @@ -496,6 +524,7 @@ def get_monitoring_locations( filter: str | None = None, filter_lang: FILTER_LANG | None = None, convert_type: bool = True, + include_hash_ids: bool = False, ) -> tuple[pd.DataFrame, BaseMetadata]: """Location information is basic information about the monitoring location including the name, identifier, agency responsible for data collection, and @@ -711,6 +740,19 @@ def get_monitoring_locations( and the lexicographic-comparison pitfall. convert_type : boolean, optional If True, converts columns to appropriate types. + include_hash_ids : boolean, optional + If False (default), hash-valued ID columns (the per-record UUID + used as the row's primary key, plus secondary hash columns such + as ``time_series_id``, ``parent_time_series_id``, + ``field_visit_id``, and ``field_measurements_series_id``) are + omitted from the response. These IDs are not stable across + record refreshes and are not human-meaningful; dropping them + also shrinks the server payload for large queries. Stable, + human-meaningful identifiers like ``monitoring_location_id``, + ``parameter_code``, and ``statistic_id`` are always returned. + Set to True to restore the pre-existing behavior of including + every column. Listing a hash column explicitly in + ``properties`` also overrides this default for that column. Returns ------- @@ -774,6 +816,7 @@ def get_time_series_metadata( filter: str | None = None, filter_lang: FILTER_LANG | None = None, convert_type: bool = True, + include_hash_ids: bool = False, ) -> tuple[pd.DataFrame, BaseMetadata]: """Daily data and continuous measurements are grouped into time series, which represent a collection of observations of a single parameter, @@ -934,6 +977,19 @@ def get_time_series_metadata( and the lexicographic-comparison pitfall. convert_type : boolean, optional If True, converts columns to appropriate types. 
+ include_hash_ids : boolean, optional + If False (default), hash-valued ID columns (the per-record UUID + used as the row's primary key, plus secondary hash columns such + as ``time_series_id``, ``parent_time_series_id``, + ``field_visit_id``, and ``field_measurements_series_id``) are + omitted from the response. These IDs are not stable across + record refreshes and are not human-meaningful; dropping them + also shrinks the server payload for large queries. Stable, + human-meaningful identifiers like ``monitoring_location_id``, + ``parameter_code``, and ``statistic_id`` are always returned. + Set to True to restore the pre-existing behavior of including + every column. Listing a hash column explicitly in + ``properties`` also overrides this default for that column. Returns ------- @@ -1031,6 +1087,7 @@ def get_combined_metadata( filter: str | None = None, filter_lang: FILTER_LANG | None = None, convert_type: bool = True, + include_hash_ids: bool = False, ) -> tuple[pd.DataFrame, BaseMetadata]: """Get combined monitoring-location and time-series metadata. @@ -1131,6 +1188,19 @@ def get_combined_metadata( and the lexicographic-comparison pitfall. convert_type : boolean, optional If True, converts columns to appropriate types. + include_hash_ids : boolean, optional + If False (default), hash-valued ID columns (the per-record UUID + used as the row's primary key, plus secondary hash columns such + as ``time_series_id``, ``parent_time_series_id``, + ``field_visit_id``, and ``field_measurements_series_id``) are + omitted from the response. These IDs are not stable across + record refreshes and are not human-meaningful; dropping them + also shrinks the server payload for large queries. Stable, + human-meaningful identifiers like ``monitoring_location_id``, + ``parameter_code``, and ``statistic_id`` are always returned. + Set to True to restore the pre-existing behavior of including + every column. 
Listing a hash column explicitly in + ``properties`` also overrides this default for that column. Returns ------- @@ -1219,6 +1289,7 @@ def get_latest_continuous( filter: str | None = None, filter_lang: FILTER_LANG | None = None, convert_type: bool = True, + include_hash_ids: bool = False, ) -> tuple[pd.DataFrame, BaseMetadata]: """This endpoint provides the most recent observation for each time series of continuous data. Continuous data are collected via automated sensors @@ -1348,6 +1419,19 @@ def get_latest_continuous( and the lexicographic-comparison pitfall. convert_type : boolean, optional If True, converts columns to appropriate types. + include_hash_ids : boolean, optional + If False (default), hash-valued ID columns (the per-record UUID + used as the row's primary key, plus secondary hash columns such + as ``time_series_id``, ``parent_time_series_id``, + ``field_visit_id``, and ``field_measurements_series_id``) are + omitted from the response. These IDs are not stable across + record refreshes and are not human-meaningful; dropping them + also shrinks the server payload for large queries. Stable, + human-meaningful identifiers like ``monitoring_location_id``, + ``parameter_code``, and ``statistic_id`` are always returned. + Set to True to restore the pre-existing behavior of including + every column. Listing a hash column explicitly in + ``properties`` also overrides this default for that column. Returns ------- @@ -1414,6 +1498,7 @@ def get_latest_daily( filter: str | None = None, filter_lang: FILTER_LANG | None = None, convert_type: bool = True, + include_hash_ids: bool = False, ) -> tuple[pd.DataFrame, BaseMetadata]: """Daily data provide one data value to represent water conditions for the day. @@ -1545,6 +1630,19 @@ def get_latest_daily( and the lexicographic-comparison pitfall. convert_type : boolean, optional If True, converts columns to appropriate types. 
+ include_hash_ids : boolean, optional + If False (default), hash-valued ID columns (the per-record UUID + used as the row's primary key, plus secondary hash columns such + as ``time_series_id``, ``parent_time_series_id``, + ``field_visit_id``, and ``field_measurements_series_id``) are + omitted from the response. These IDs are not stable across + record refreshes and are not human-meaningful; dropping them + also shrinks the server payload for large queries. Stable, + human-meaningful identifiers like ``monitoring_location_id``, + ``parameter_code``, and ``statistic_id`` are always returned. + Set to True to restore the pre-existing behavior of including + every column. Listing a hash column explicitly in + ``properties`` also overrides this default for that column. Returns ------- @@ -1612,6 +1710,7 @@ def get_field_measurements( filter: str | None = None, filter_lang: FILTER_LANG | None = None, convert_type: bool = True, + include_hash_ids: bool = False, ) -> tuple[pd.DataFrame, BaseMetadata]: """Field measurements are physically measured values collected during a visit to the monitoring location. Field measurements consist of measurements @@ -1733,6 +1832,19 @@ def get_field_measurements( and the lexicographic-comparison pitfall. convert_type : boolean, optional If True, converts columns to appropriate types. + include_hash_ids : boolean, optional + If False (default), hash-valued ID columns (the per-record UUID + used as the row's primary key, plus secondary hash columns such + as ``time_series_id``, ``parent_time_series_id``, + ``field_visit_id``, and ``field_measurements_series_id``) are + omitted from the response. These IDs are not stable across + record refreshes and are not human-meaningful; dropping them + also shrinks the server payload for large queries. Stable, + human-meaningful identifiers like ``monitoring_location_id``, + ``parameter_code``, and ``statistic_id`` are always returned. 
+ Set to True to restore the pre-existing behavior of including + every column. Listing a hash column explicitly in + ``properties`` also overrides this default for that column. Returns ------- @@ -1796,6 +1908,7 @@ def get_field_measurements_metadata( filter: str | None = None, filter_lang: FILTER_LANG | None = None, convert_type: bool = True, + include_hash_ids: bool = False, ) -> tuple[pd.DataFrame, BaseMetadata]: """Get field-measurement metadata: one row per (location, parameter) series. @@ -1851,6 +1964,19 @@ def get_field_measurements_metadata( and the lexicographic-comparison pitfall. convert_type : boolean, optional If True, converts columns to appropriate types. + include_hash_ids : boolean, optional + If False (default), hash-valued ID columns (the per-record UUID + used as the row's primary key, plus secondary hash columns such + as ``time_series_id``, ``parent_time_series_id``, + ``field_visit_id``, and ``field_measurements_series_id``) are + omitted from the response. These IDs are not stable across + record refreshes and are not human-meaningful; dropping them + also shrinks the server payload for large queries. Stable, + human-meaningful identifiers like ``monitoring_location_id``, + ``parameter_code``, and ``statistic_id`` are always returned. + Set to True to restore the pre-existing behavior of including + every column. Listing a hash column explicitly in + ``properties`` also overrides this default for that column. Returns ------- @@ -1917,6 +2043,7 @@ def get_peaks( filter: str | None = None, filter_lang: FILTER_LANG | None = None, convert_type: bool = True, + include_hash_ids: bool = False, ) -> tuple[pd.DataFrame, BaseMetadata]: """Get the annual peak streamflow / stage record for a monitoring location. @@ -1975,6 +2102,19 @@ def get_peaks( and the lexicographic-comparison pitfall. convert_type : boolean, optional If True, converts columns to appropriate types. 
+ include_hash_ids : boolean, optional + If False (default), hash-valued ID columns (the per-record UUID + used as the row's primary key, plus secondary hash columns such + as ``time_series_id``, ``parent_time_series_id``, + ``field_visit_id``, and ``field_measurements_series_id``) are + omitted from the response. These IDs are not stable across + record refreshes and are not human-meaningful; dropping them + also shrinks the server payload for large queries. Stable, + human-meaningful identifiers like ``monitoring_location_id``, + ``parameter_code``, and ``statistic_id`` are always returned. + Set to True to restore the pre-existing behavior of including + every column. Listing a hash column explicitly in + ``properties`` also overrides this default for that column. Returns ------- @@ -2718,6 +2858,7 @@ def get_channel( filter: str | None = None, filter_lang: FILTER_LANG | None = None, convert_type: bool = True, + include_hash_ids: bool = False, ) -> tuple[pd.DataFrame, BaseMetadata]: """ Channel measurements taken as part of streamflow field measurements. @@ -2832,6 +2973,19 @@ def get_channel( convert_type : boolean, optional If True, the function will convert the data to dates and qualifier to string vector + include_hash_ids : boolean, optional + If False (default), hash-valued ID columns (the per-record UUID + used as the row's primary key, plus secondary hash columns such + as ``time_series_id``, ``parent_time_series_id``, + ``field_visit_id``, and ``field_measurements_series_id``) are + omitted from the response. These IDs are not stable across + record refreshes and are not human-meaningful; dropping them + also shrinks the server payload for large queries. Stable, + human-meaningful identifiers like ``monitoring_location_id``, + ``parameter_code``, and ``statistic_id`` are always returned. + Set to True to restore the pre-existing behavior of including + every column. 
Listing a hash column explicitly in + ``properties`` also overrides this default for that column. Returns ------- diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py index 66ed1723..7994f07e 100644 --- a/dataretrieval/waterdata/utils.py +++ b/dataretrieval/waterdata/utils.py @@ -181,6 +181,91 @@ def _switch_properties_id(properties: list[str] | None, id_name: str, service: s # parameters and require POST with CQL2 JSON instead. _CQL2_REQUIRED_SERVICES = frozenset({"monitoring-locations"}) +# Column names whose values are server-generated hashes (UUIDs or hex +# digests). These are unstable across record refreshes — joining or +# diffing on them produces spurious churn — and they bloat the payload +# of large queries. Dropped by default; opt in with +# ``include_hash_ids=True``. Includes both: +# - The per-record version UUIDs that are aliased to a service's +# ``output_id`` (``daily_id``, ``continuous_id``, …). These get +# mapped to/from ``"id"`` on the wire; both names are listed so the +# filter works on either side of ``_switch_properties_id``. +# - Secondary hash columns embedded in record payloads +# (``time_series_id``, ``field_visit_id``, ``parent_time_series_id``, +# ``field_measurements_series_id``). +# ``monitoring_location_id`` (AGENCY-ID format, e.g. ``USGS-01646500``) +# and other code columns (``parameter_code``, ``statistic_id``, …) are +# intentionally absent — they're stable, human-meaningful identifiers. +_HASH_ID_COLUMNS = frozenset( + { + "daily_id", + "continuous_id", + "latest_continuous_id", + "latest_daily_id", + "field_measurement_id", + "field_series_id", + "peak_id", + "channel_measurements_id", + "combined_meta_id", + "time_series_id", + "parent_time_series_id", + "field_visit_id", + "field_measurements_series_id", + } +) + +# Cache of per-service queryables column lists, populated on first call +# from each service when computing the default ``properties=`` for +# ``include_hash_ids=False``. 
Keyed by service name; value is the full +# list of property names the server exposes for that collection. +_queryables_cache: dict[str, list[str]] = {} + + +def _service_queryables(service: str) -> list[str]: + """Return the cached queryables property list for ``service``. + + One HTTP GET per service per process; the list is reused for every + subsequent call. Raises ``requests.HTTPError`` on a non-200; the + server-side trim is only an optimization, so ``get_ogc_data`` + catches the error, logs a warning, and falls back to the + client-side hash-ID drop in ``_arrange_cols``. + """ + cached = _queryables_cache.get(service) + if cached is not None: + return cached + body = _check_ogc_requests(endpoint=service, req_type="queryables") + props = list(body.get("properties", {}).keys()) + _queryables_cache[service] = props + return props + + +def _default_non_hash_properties(service: str, output_id: str) -> list[str]: + """Build the ``properties=`` whitelist sent to the server when the + caller didn't supply one and ``include_hash_ids=False``. + + Returns the service's queryables minus: + - any column whose wire-format name is in :data:`_HASH_ID_COLUMNS` + (the secondary hashes like ``time_series_id``, + ``parent_time_series_id``, ``field_visit_id``); + - the wire-format ``"id"`` column, but only when the service's + ``output_id`` (its post-rename name) is itself a hash column — + i.e., for ``daily``/``continuous``/``peaks``/etc., where ``id`` + becomes ``daily_id``/``continuous_id``/``peak_id``. For + ``monitoring-locations`` (where ``id`` becomes the AGENCY-ID + ``monitoring_location_id``) the ``id`` column is kept; + - ``"geometry"`` (the OGC server returns geometry via the feature + envelope, not as a property — listing it would be redundant and + some collections reject it). 
+ """ + drop_wire_id = output_id in _HASH_ID_COLUMNS + return [ + p + for p in _service_queryables(service) + if p not in _HASH_ID_COLUMNS + and p != "geometry" + and not (drop_wire_id and p == "id") + ] + def _parse_datetime(value: str) -> datetime | None: """Parse a single datetime string against the supported formats. @@ -1104,7 +1189,10 @@ def _deal_with_empty( def _arrange_cols( - df: pd.DataFrame, properties: list[str] | None, output_id: str + df: pd.DataFrame, + properties: list[str] | None, + output_id: str, + include_hash_ids: bool = False, ) -> pd.DataFrame: """ Rearranges and renames columns in a DataFrame based on provided @@ -1119,6 +1207,13 @@ def _arrange_cols( only NaN, the function renames 'id' to output_id. output_id : str The name to which the 'id' column should be renamed if applicable. + include_hash_ids : bool, optional + If False (default), hash-valued ID columns (see + :data:`_HASH_ID_COLUMNS`) are dropped from the result unless the + caller explicitly named them in ``properties``. If True, the + legacy behavior is preserved: hash columns are kept and the + per-record output_id columns are moved to the end of the + DataFrame when ``properties`` is unspecified. Returns ------- @@ -1141,8 +1236,38 @@ def _arrange_cols( local_properties[local_properties.index("id")] = output_id df = df.loc[:, [col for col in local_properties if col in df.columns]] - # Move meaningless-to-user, extra id columns to the end - # of the dataframe, if they exist + # Default: drop hash-valued ID columns the caller didn't ask for. + # This is the client-side counterpart to the server-side + # ``properties=`` trim done in ``get_ogc_data``; it's a no-op on + # the happy path (the server already omitted them) but catches the + # fallback case where the queryables fetch failed and we + # round-tripped a full payload. 
Drops apply uniformly even when the + # hash column is the service's renamed ``output_id`` (e.g., + # ``daily_id``) — the user-meaningful identifiers + # (``monitoring_location_id`` + ``time`` + ``parameter_code`` + + # ``statistic_id``) are sufficient to pin a row. + if not include_hash_ids: + requested = ( + set(properties) if properties and not all(pd.isna(properties)) else set() + ) + # ``"id"`` in ``properties`` resolves to the renamed ``output_id`` + # (matching the rename done above and in ``_switch_properties_id``), + # so treat the user as having asked for that output_id too. + if "id" in requested: + requested.add(output_id) + drop_cols = [ + col + for col in df.columns + if col in _HASH_ID_COLUMNS and col not in requested + ] + if drop_cols: + df = df.drop(columns=drop_cols) + + # Legacy ordering: when ``include_hash_ids=True`` and ``properties`` + # is unspecified, move the per-record version IDs to the end of the + # DataFrame so they don't crowd the front. With + # ``include_hash_ids=False`` those columns are gone above, so this + # branch becomes a no-op. extra_id_col = set(df.columns).intersection( { "latest_continuous_id", @@ -1153,9 +1278,6 @@ def _arrange_cols( } ) - # If the arbitrary id column is returned (either due to properties - # being none or NaN), then move it to the end of the dataframe, but - # if part of properties, keep in requested order if extra_id_col and (properties is None or all(pd.isna(properties))): id_col_order = [col for col in df.columns if col not in extra_id_col] + list( extra_id_col @@ -1273,10 +1395,36 @@ def get_ogc_data( # Capture `properties` before the id-switch so post-processing sees # the user-facing names, not the wire-format ones. 
properties = args.get("properties") - args["properties"] = _switch_properties_id( - properties, id_name=output_id, service=service - ) convert_type = args.pop("convert_type", False) + include_hash_ids = args.pop("include_hash_ids", False) + + # When the caller didn't pin ``properties`` and isn't opting into + # hash IDs, send a server-side whitelist of the non-hash columns so + # the server (a) skips serializing UUID/hex fields and (b) returns + # a smaller payload for us to parse. ``_arrange_cols`` still sees + # ``properties=None`` (the original user input), so columns retain + # the schema's natural order rather than being subset to whatever + # whitelist we synthesized here. + if not include_hash_ids and (properties is None or all(pd.isna(properties))): + try: + args["properties"] = _default_non_hash_properties(service, output_id) + except (requests.HTTPError, requests.RequestException, ValueError) as exc: + # Server-side trim is an optimization, not a correctness + # requirement — fall back to a full payload and rely on the + # ``_arrange_cols`` post-processing drop below. 
+ logger.warning( + "Could not fetch queryables for %s (%s); " + "falling back to client-side hash-ID drop.", + service, + exc, + ) + args["properties"] = _switch_properties_id( + properties, id_name=output_id, service=service + ) + else: + args["properties"] = _switch_properties_id( + properties, id_name=output_id, service=service + ) args = {k: v for k, v in args.items() if v is not None} with _progress.progress_context(service=service): @@ -1284,7 +1432,9 @@ def get_ogc_data( return_list = _deal_with_empty(return_list, properties, service) if convert_type: return_list = _type_cols(return_list) - return_list = _arrange_cols(return_list, properties, output_id) + return_list = _arrange_cols( + return_list, properties, output_id, include_hash_ids=include_hash_ids + ) return_list = _sort_rows(return_list) return return_list, BaseMetadata(response) diff --git a/tests/waterdata_test.py b/tests/waterdata_test.py index 09f66aa5..db94a819 100644 --- a/tests/waterdata_test.py +++ b/tests/waterdata_test.py @@ -280,10 +280,14 @@ def test_get_daily(): parameter_code="00060", time="2025-01-01/..", ) - assert "daily_id" in df.columns + # Default: hash-valued ID columns (daily_id, time_series_id) are + # dropped. Stable identifiers (monitoring_location_id, + # parameter_code, statistic_id, time) are preserved. 
+ assert "daily_id" not in df.columns + assert "time_series_id" not in df.columns + assert "monitoring_location_id" in df.columns assert "geometry" in df.columns - assert df.columns[-1] == "daily_id" - assert df.shape[1] == 12 + assert df.shape[1] == 10 assert df.parameter_code.unique().tolist() == ["00060"] assert df.monitoring_location_id.unique().tolist() == ["USGS-05427718"] assert df["time"].apply(lambda x: isinstance(x, datetime.date)).all() @@ -293,6 +297,22 @@ def test_get_daily(): assert df["value"].dtype == "float64" +def test_get_daily_include_hash_ids(): + """``include_hash_ids=True`` restores the legacy behavior: the + per-record UUID (``daily_id``) and secondary hashes + (``time_series_id``) are present.""" + df, _ = get_daily( + monitoring_location_id="USGS-05427718", + parameter_code="00060", + time="2025-01-01/..", + include_hash_ids=True, + ) + assert "daily_id" in df.columns + assert "time_series_id" in df.columns + assert df.columns[-1] == "daily_id" + assert df.shape[1] == 12 + + def test_get_daily_properties(): df, _ = get_daily( monitoring_location_id="USGS-05427718", @@ -338,7 +358,8 @@ def test_get_daily_no_geometry(): skip_geometry=True, ) assert "geometry" not in df.columns - assert df.shape[1] == 11 + # 10 default cols minus geometry, with hash IDs dropped by default. + assert df.shape[1] == 9 assert isinstance(df, DataFrame) @@ -354,7 +375,11 @@ def test_get_continuous(): df["time"].dtype.name.startswith("datetime64[") and "UTC" in df["time"].dtype.name ) - assert "continuous_id" in df.columns + # Default: continuous_id (UUID) and time_series_id (hex hash) are + # dropped. Set ``include_hash_ids=True`` to keep them. 
+ assert "continuous_id" not in df.columns + assert "time_series_id" not in df.columns + assert "monitoring_location_id" in df.columns def test_get_monitoring_locations(): @@ -379,7 +404,10 @@ def test_get_latest_continuous(): monitoring_location_id=["USGS-05427718", "USGS-05427719"], parameter_code=["00060", "00065"], ) - assert df.columns[-1] == "latest_continuous_id" + # Default: latest_continuous_id (UUID) and time_series_id are dropped. + assert "latest_continuous_id" not in df.columns + assert "time_series_id" not in df.columns + assert "monitoring_location_id" in df.columns assert df.shape[0] <= 4 assert df.statistic_id.unique().tolist() == ["00011"] assert hasattr(md, "url") @@ -394,8 +422,11 @@ def test_get_latest_daily(): monitoring_location_id=["USGS-05427718", "USGS-05427719"], parameter_code=["00060", "00065"], ) - assert "latest_daily_id" in df.columns - assert df.shape[1] == 12 + # Default: latest_daily_id (UUID) and time_series_id are dropped. + assert "latest_daily_id" not in df.columns + assert "time_series_id" not in df.columns + assert "monitoring_location_id" in df.columns + assert df.shape[1] == 10 assert hasattr(md, "url") assert hasattr(md, "query_time") @@ -423,7 +454,12 @@ def test_get_field_measurements(): time="2025-01-01/2025-10-01", skip_geometry=True, ) - assert "field_measurement_id" in df.columns + # Default: field_measurement_id (UUID), field_measurements_series_id + # (UUID), and field_visit_id (UUID) are dropped. 
+ assert "field_measurement_id" not in df.columns + assert "field_measurements_series_id" not in df.columns + assert "field_visit_id" not in df.columns + assert "monitoring_location_id" in df.columns assert "geometry" not in df.columns assert df.unit_of_measure.unique().tolist() == ["ft^3/s"] assert hasattr(md, "url") @@ -481,7 +517,9 @@ def test_get_field_measurements_metadata(): df, md = get_field_measurements_metadata( monitoring_location_id="USGS-02238500", skip_geometry=True ) - assert "field_series_id" in df.columns + # Default: field_series_id (UUID) is dropped. + assert "field_series_id" not in df.columns + assert "monitoring_location_id" in df.columns assert "begin" in df.columns assert "end" in df.columns assert (df["monitoring_location_id"] == "USGS-02238500").all() @@ -509,7 +547,10 @@ def test_get_field_measurements_metadata_multi_site(): def test_get_peaks(): df, md = get_peaks(monitoring_location_id="USGS-02238500", skip_geometry=True) - assert "peak_id" in df.columns + # Default: peak_id (UUID) and time_series_id are dropped. + assert "peak_id" not in df.columns + assert "time_series_id" not in df.columns + assert "monitoring_location_id" in df.columns assert "value" in df.columns assert "water_year" in df.columns assert (df["monitoring_location_id"] == "USGS-02238500").all() @@ -607,8 +648,12 @@ def test_get_channel(): df, _ = get_channel(monitoring_location_id="USGS-02238500") assert df.shape[0] > 470 - assert df.shape[1] == 27 # if geopandas installed, 21 columns if not - assert "channel_measurements_id" in df.columns + # Default: channel_measurements_id (UUID) and field_visit_id (UUID) + # are dropped. 27 → 25 cols. 
+ assert df.shape[1] == 25 # if geopandas installed, fewer if not + assert "channel_measurements_id" not in df.columns + assert "field_visit_id" not in df.columns + assert "monitoring_location_id" in df.columns class TestCheckMonitoringLocationId: diff --git a/tests/waterdata_utils_test.py b/tests/waterdata_utils_test.py index bb5ece10..aa38a447 100644 --- a/tests/waterdata_utils_test.py +++ b/tests/waterdata_utils_test.py @@ -531,6 +531,81 @@ def test_arrange_cols_keeps_geometry_when_present(): assert "geometry" in result.columns +def test_arrange_cols_drops_hash_ids_by_default(): + """Default ``include_hash_ids=False`` drops the per-record UUID + (renamed to ``daily_id``) and secondary hash columns + (``time_series_id``), keeping stable identifiers.""" + df = pd.DataFrame( + { + "id": ["uuid-a"], + "time_series_id": ["hex-1"], + "monitoring_location_id": ["USGS-01"], + "value": [1.0], + } + ) + result = _arrange_cols(df, properties=None, output_id="daily_id") + assert "daily_id" not in result.columns + assert "time_series_id" not in result.columns + assert "monitoring_location_id" in result.columns + assert "value" in result.columns + + +def test_arrange_cols_include_hash_ids_keeps_them(): + """``include_hash_ids=True`` preserves the legacy behavior — hash + columns are kept and the per-record UUID lands at the end of the + column order.""" + df = pd.DataFrame( + { + "id": ["uuid-a"], + "time_series_id": ["hex-1"], + "monitoring_location_id": ["USGS-01"], + "value": [1.0], + } + ) + result = _arrange_cols( + df, properties=None, output_id="daily_id", include_hash_ids=True + ) + assert "daily_id" in result.columns + assert "time_series_id" in result.columns + # Legacy ordering: ``daily_id`` moves to the end. 
+ assert result.columns[-1] == "daily_id" + + +def test_arrange_cols_explicit_properties_keep_hash_ids(): + """A user who lists a hash column in ``properties`` gets it back even + with the default ``include_hash_ids=False`` — explicit beats default.""" + df = pd.DataFrame( + { + "id": ["uuid-a"], + "time_series_id": ["hex-1"], + "monitoring_location_id": ["USGS-01"], + "value": [1.0], + } + ) + result = _arrange_cols( + df, + properties=["daily_id", "time_series_id", "value"], + output_id="daily_id", + ) + assert "daily_id" in result.columns + assert "time_series_id" in result.columns + + +def test_arrange_cols_non_hash_output_id_kept(): + """``monitoring_location_id`` (the output_id for monitoring-locations) + is NOT a hash — the AGENCY-ID format is stable and human-meaningful — + so it must stay even under the default.""" + df = pd.DataFrame( + { + "id": ["USGS-01"], + "agency_code": ["USGS"], + } + ) + result = _arrange_cols(df, properties=None, output_id="monitoring_location_id") + assert "monitoring_location_id" in result.columns + assert result.loc[0, "monitoring_location_id"] == "USGS-01" + + # --- _format_api_dates ------------------------------------------------------- From 9c625f3922caab0429d64a1d514d8d44cf37fb85 Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Sun, 17 May 2026 17:19:44 -0500 Subject: [PATCH 02/24] feat(waterdata): extend hash-ID drop to get_stats_por / get_stats_date_range MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The OGC ``get_*`` functions in the prior commit drop hash columns through ``get_ogc_data``. The statistics services (which return JSON through ``get_stats_data`` rather than OGC features) bypassed that path, so ``get_stats_por`` and ``get_stats_date_range`` were still returning ``computation_id`` (UUID) and ``parent_time_series_id`` (hex hash) by default. This commit: - Adds ``computation_id`` to ``_HASH_ID_COLUMNS`` (``parent_time_series_id`` was already there). 
- Plumbs ``include_hash_ids: bool = False`` through ``get_stats_data``, ``get_stats_por``, and ``get_stats_date_range``. - Drops the hash columns at the end of ``get_stats_data``, after ``_expand_percentiles`` (which still needs ``computation_id`` as a join key while it explodes the percentile lists into rows). - Updates ``test_get_stats_por_expanded_false`` / ``test_get_stats_date_range`` to reflect the new column count and adds ``test_get_stats_por_include_hash_ids`` documenting the opt-in. Discovered while running a live-API sweep across every public waterdata ``get_*`` function — the OGC services now pass, the stats ones used to leak, and this commit closes that gap. Co-Authored-By: Claude Opus 4.7 (1M context) --- dataretrieval/waterdata/api.py | 26 +++++++++++++++++++++++--- dataretrieval/waterdata/utils.py | 22 ++++++++++++++++++++++ tests/waterdata_test.py | 25 +++++++++++++++++++++++-- 3 files changed, 68 insertions(+), 5 deletions(-) diff --git a/dataretrieval/waterdata/api.py b/dataretrieval/waterdata/api.py index f586d229..4bfa42b3 100644 --- a/dataretrieval/waterdata/api.py +++ b/dataretrieval/waterdata/api.py @@ -2586,6 +2586,7 @@ def get_stats_por( site_type_name: str | Iterable[str] | None = None, parameter_code: str | Iterable[str] | None = None, expand_percentiles: bool = True, + include_hash_ids: bool = False, ) -> tuple[pd.DataFrame, BaseMetadata]: """Get day-of-year and month-of-year water data statistics from the USGS Water Data API. @@ -2664,6 +2665,13 @@ def get_stats_por( argument will return both the "values" column, containing the list of percentile threshold values, and a "value" column, containing the singular summary value for the other statistics. + include_hash_ids : boolean, optional + If False (default), the per-computation UUID (``computation_id``) + and the upstream time-series hex hash (``parent_time_series_id``) + are dropped from the returned DataFrame. 
Stable identifiers + (``monitoring_location_id``, ``parameter_code``, the time keys) + are kept. Set to True to restore the legacy behavior of + including every column. Examples -------- @@ -2688,10 +2696,13 @@ def get_stats_por( ... ) """ # Build argument dictionary, omitting None values - params = _get_args(locals(), exclude={"expand_percentiles"}) + params = _get_args(locals(), exclude={"expand_percentiles", "include_hash_ids"}) return get_stats_data( - args=params, service="observationNormals", expand_percentiles=expand_percentiles + args=params, + service="observationNormals", + expand_percentiles=expand_percentiles, + include_hash_ids=include_hash_ids, ) @@ -2710,6 +2721,7 @@ def get_stats_date_range( site_type_name: str | Iterable[str] | None = None, parameter_code: str | Iterable[str] | None = None, expand_percentiles: bool = True, + include_hash_ids: bool = False, ) -> tuple[pd.DataFrame, BaseMetadata]: """Get monthly and annual water data statistics from the USGS Water Data API. This service (called the "observationIntervals" endpoint on api.waterdata.usgs.gov) @@ -2792,6 +2804,13 @@ def get_stats_date_range( argument will return both the "values" column, containing the list of percentile threshold values, and a "value" column, containing the singular summary value for the other statistics. + include_hash_ids : boolean, optional + If False (default), the per-computation UUID (``computation_id``) + and the upstream time-series hex hash (``parent_time_series_id``) + are dropped from the returned DataFrame. Stable identifiers + (``monitoring_location_id``, ``parameter_code``, the time keys) + are kept. Set to True to restore the legacy behavior of + including every column. Examples -------- @@ -2817,12 +2836,13 @@ def get_stats_date_range( ... 
) """ # Build argument dictionary, omitting None values - params = _get_args(locals(), exclude={"expand_percentiles"}) + params = _get_args(locals(), exclude={"expand_percentiles", "include_hash_ids"}) return get_stats_data( args=params, service="observationIntervals", expand_percentiles=expand_percentiles, + include_hash_ids=include_hash_ids, ) diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py index 7994f07e..ec8f6ff3 100644 --- a/dataretrieval/waterdata/utils.py +++ b/dataretrieval/waterdata/utils.py @@ -211,6 +211,9 @@ def _switch_properties_id(properties: list[str] | None, id_name: str, service: s "parent_time_series_id", "field_visit_id", "field_measurements_series_id", + # ``get_stats_*`` (statistics service) output — per-computation + # UUID; ``parent_time_series_id`` is already listed above. + "computation_id", } ) @@ -1620,6 +1623,7 @@ def get_stats_data( service: str, expand_percentiles: bool, client: httpx.Client | None = None, + include_hash_ids: bool = False, ) -> tuple[pd.DataFrame, BaseMetadata]: """ Retrieves statistical data from a specified endpoint and returns it @@ -1641,6 +1645,13 @@ def get_stats_data( each percentile gets its own row in the returned dataframe. If True and user requests a computation_type other than percentiles, a percentile column is still returned. + include_hash_ids : bool, optional + If False (default), the per-computation UUID (``computation_id``) + and the upstream time-series hex hash (``parent_time_series_id``) + are dropped from the returned DataFrame. These IDs are not + stable across record refreshes; ``computation_id`` is used as a + join key internally during percentile expansion and only + removed after that step completes. 
Returns ------- @@ -1686,6 +1697,17 @@ def follow_up(cursor: str, client: httpx.Client) -> httpx.Response: if expand_percentiles: df = _expand_percentiles(df) + + # Drop hash-valued ID columns at the end (after + # ``_expand_percentiles``, which still needs ``computation_id`` + # as a merge key while it explodes the percentile lists into + # rows). Stable identifiers (``monitoring_location_id``, + # ``parameter_code``, ``time_of_year``, …) are kept. + if not include_hash_ids: + drop_cols = [col for col in df.columns if col in _HASH_ID_COLUMNS] + if drop_cols: + df = df.drop(columns=drop_cols) + return df, BaseMetadata(response) diff --git a/tests/waterdata_test.py b/tests/waterdata_test.py index db94a819..5ac65a74 100644 --- a/tests/waterdata_test.py +++ b/tests/waterdata_test.py @@ -621,13 +621,31 @@ def test_get_stats_por_expanded_false(): computation_type=["minimum", "percentile"], ) assert df.shape[0] == 4 - assert df.shape[1] == 20 # if geopandas installed, 21 columns if not + # Default: hash IDs (computation_id, parent_time_series_id) dropped → 18 cols. 
+ assert df.shape[1] == 18 + assert "computation_id" not in df.columns + assert "parent_time_series_id" not in df.columns assert "percentile" not in df.columns assert "percentiles" in df.columns assert type(df["percentiles"][2]) is list assert df.loc[~df["percentiles"].isna(), "value"].isnull().all() +def test_get_stats_por_include_hash_ids(): + """``include_hash_ids=True`` preserves the per-computation UUID + and the upstream time-series hex hash that ``get_stats_*`` used + to return unconditionally.""" + df, _ = get_stats_por( + monitoring_location_id="USGS-12451000", + parameter_code="00060", + start_date="01-01", + end_date="01-01", + include_hash_ids=True, + ) + assert "computation_id" in df.columns + assert "parent_time_series_id" in df.columns + + def test_get_stats_date_range(): df, _ = get_stats_date_range( monitoring_location_id="USGS-12451000", @@ -638,7 +656,10 @@ def test_get_stats_date_range(): ) assert df.shape[0] == 3 - assert df.shape[1] == 20 # if geopandas installed, 21 columns if not + # Default: hash IDs (computation_id, parent_time_series_id) dropped → 18 cols. + assert df.shape[1] == 18 + assert "computation_id" not in df.columns + assert "parent_time_series_id" not in df.columns assert "interval_type" in df.columns assert "percentile" in df.columns assert df["interval_type"].isin(["month", "calendar_year", "water_year"]).all() From 2d01ae3ee5c324d01710d1642afca68e34fae5fd Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Sun, 17 May 2026 17:22:37 -0500 Subject: [PATCH 03/24] test(waterdata): add stats hash-drop unit tests Two mocked-response tests for ``get_stats_data``: - ``test_get_stats_data_drops_hash_ids_by_default`` asserts ``computation_id`` and ``parent_time_series_id`` are removed when ``include_hash_ids=False`` (the new default). - ``test_get_stats_data_keeps_hash_ids_when_opted_in`` asserts the opt-in path preserves them, matching the legacy behavior. 
Both use ``monkeypatch`` to stub ``_handle_stats_nesting`` so the fake response only needs to carry the column shape, not the full nested-percentile body. Co-Authored-By: Claude Opus 4.7 (1M context) --- tests/waterdata_utils_test.py | 82 +++++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) diff --git a/tests/waterdata_utils_test.py b/tests/waterdata_utils_test.py index aa38a447..5b12c494 100644 --- a/tests/waterdata_utils_test.py +++ b/tests/waterdata_utils_test.py @@ -376,6 +376,88 @@ def test_get_stats_data_warning_includes_next_token(caplog, monkeypatch): assert any("tok2" in m for m in warnings_), warnings_ +def test_get_stats_data_drops_hash_ids_by_default(monkeypatch): + """``get_stats_data`` drops ``computation_id`` and + ``parent_time_series_id`` from the result by default — the + ``include_hash_ids=False`` counterpart for the stats path.""" + from dataretrieval.waterdata.utils import get_stats_data + + monkeypatch.setattr( + _utils_module, + "_handle_stats_nesting", + mock.MagicMock( + return_value=pd.DataFrame( + { + "monitoring_location_id": ["USGS-1"], + "parameter_code": ["00060"], + "computation_id": ["7d70379f-8452-44cd-b026-24dfa11f8503"], + "parent_time_series_id": ["9cca880dec4846ec8cbdd05f3e22603e"], + "value": [1.0], + } + ) + ), + ) + + page1 = mock.MagicMock() + page1.status_code = 200 + page1.json.return_value = {"next": None, "features": []} + page1.elapsed = __import__("datetime").timedelta(milliseconds=1) + page1.headers = {} + page1.url = "https://example/stats" + client = mock.MagicMock(spec=requests.Session) + client.send.return_value = page1 + + df, _ = get_stats_data( + args={"monitoring_location_id": "USGS-1"}, + service="observationNormals", + expand_percentiles=False, + client=client, + ) + assert "computation_id" not in df.columns + assert "parent_time_series_id" not in df.columns + assert "monitoring_location_id" in df.columns + assert "parameter_code" in df.columns + assert "value" in df.columns + + +def 
test_get_stats_data_keeps_hash_ids_when_opted_in(monkeypatch): + """``include_hash_ids=True`` preserves the legacy stats columns.""" + from dataretrieval.waterdata.utils import get_stats_data + + monkeypatch.setattr( + _utils_module, + "_handle_stats_nesting", + mock.MagicMock( + return_value=pd.DataFrame( + { + "monitoring_location_id": ["USGS-1"], + "computation_id": ["7d70379f-8452-44cd-b026-24dfa11f8503"], + "parent_time_series_id": ["9cca880dec4846ec8cbdd05f3e22603e"], + } + ) + ), + ) + + page1 = mock.MagicMock() + page1.status_code = 200 + page1.json.return_value = {"next": None, "features": []} + page1.elapsed = __import__("datetime").timedelta(milliseconds=1) + page1.headers = {} + page1.url = "https://example/stats" + client = mock.MagicMock(spec=requests.Session) + client.send.return_value = page1 + + df, _ = get_stats_data( + args={"monitoring_location_id": "USGS-1"}, + service="observationNormals", + expand_percentiles=False, + client=client, + include_hash_ids=True, + ) + assert "computation_id" in df.columns + assert "parent_time_series_id" in df.columns + + def test_handle_stats_nesting_tolerates_missing_drop_columns(): """If the upstream stats response shape ever changes such that one of the columns we try to drop ("type", "properties.data") is absent, the From fd940d922c23feadaf4b5c1afbe82c859b6935ce Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Sun, 17 May 2026 17:25:57 -0500 Subject: [PATCH 04/24] feat(waterdata): extend hash-ID drop to get_samples MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A live-API column-content scan found that the Samples CSV service returns two UUID-valued columns by default: - ``Activity_ActivityIdentifier`` (per-activity UUID) - ``Result_MeasureIdentifier`` (per-measurement UUID) These weren't covered by the prior OGC / stats commits because ``get_samples`` parses CSV directly without going through ``get_ogc_data`` or ``get_stats_data``. 
This commit: - Adds the two CamelCase column names to ``_HASH_ID_COLUMNS``. - Plumbs ``include_hash_ids: bool = False`` through ``get_samples`` and drops the named columns from the parsed CSV before returning. - Updates ``test_mock_get_samples`` to reflect the new column count (187 → 185) and adds ``test_mock_get_samples_include_hash_ids`` for the opt-in path. - Updates ``test_samples_results`` and ``test_samples_activity`` similarly. Stable identifiers (``Org_Identifier``, ``Location_Identifier``, ``Project_Identifier``, ``USGSpcode``, …) are kept unchanged. Co-Authored-By: Claude Opus 4.7 (1M context) --- dataretrieval/waterdata/api.py | 21 +++++++++++++- dataretrieval/waterdata/utils.py | 6 ++++ tests/waterdata_test.py | 47 +++++++++++++++++++++++++++----- 3 files changed, 66 insertions(+), 8 deletions(-) diff --git a/dataretrieval/waterdata/api.py b/dataretrieval/waterdata/api.py index 4bfa42b3..5cb97ab5 100644 --- a/dataretrieval/waterdata/api.py +++ b/dataretrieval/waterdata/api.py @@ -30,6 +30,7 @@ SERVICES, ) from dataretrieval.waterdata.utils import ( + _HASH_ID_COLUMNS, SAMPLES_URL, _check_profiles, _default_headers, @@ -2292,6 +2293,7 @@ def get_samples( pointLocationWithinMiles: float | None = None, projectIdentifier: str | Iterable[str] | None = None, recordIdentifierUserSupplied: str | Iterable[str] | None = None, + include_hash_ids: bool = False, ) -> tuple[pd.DataFrame, BaseMetadata]: """Search Samples database for USGS water quality data. This is a wrapper function for the Samples database API. All potential @@ -2422,6 +2424,14 @@ def get_samples( recordIdentifierUserSupplied : string or iterable of strings, optional Internal AQS record identifier that returns 1 entry. Only available for the "results" service. + include_hash_ids : boolean, optional + If False (default), the per-activity UUID + (``Activity_ActivityIdentifier``) and per-result UUID + (``Result_MeasureIdentifier``) are dropped from the returned + DataFrame. 
Stable identifiers (``Org_Identifier``, + ``Location_Identifier``, ``Project_Identifier``, + ``USGSpcode``, …) are kept. Set to True to restore the legacy + behavior of including every column. Returns ------- @@ -2471,7 +2481,7 @@ def get_samples( _check_profiles(service, profile) # Build argument dictionary, omitting None values - params = _get_args(locals(), exclude={"ssl_check", "profile"}) + params = _get_args(locals(), exclude={"ssl_check", "profile", "include_hash_ids"}) params.update({"mimeType": "text/csv"}) @@ -2495,6 +2505,15 @@ def get_samples( df = pd.read_csv(StringIO(response.text), delimiter=",") df = _attach_datetime_columns(df) + # Drop hash-valued ID columns (``Activity_ActivityIdentifier``, + # ``Result_MeasureIdentifier`` — both UUIDs) by default. + # Stable identifiers like ``Org_Identifier``, + # ``Location_Identifier``, ``Project_Identifier`` are kept. + if not include_hash_ids: + drop_cols = [c for c in df.columns if c in _HASH_ID_COLUMNS] + if drop_cols: + df = df.drop(columns=drop_cols) + return df, BaseMetadata(response) diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py index ec8f6ff3..14e9d9c8 100644 --- a/dataretrieval/waterdata/utils.py +++ b/dataretrieval/waterdata/utils.py @@ -214,6 +214,12 @@ def _switch_properties_id(properties: list[str] | None, id_name: str, service: s # ``get_stats_*`` (statistics service) output — per-computation # UUID; ``parent_time_series_id`` is already listed above. "computation_id", + # ``get_samples`` (Samples database CSV) — per-activity and + # per-result UUIDs. The Samples service uses CamelCase column + # names rather than snake_case, but the drop logic only needs + # exact name matches so they share this set. 
+ "Activity_ActivityIdentifier", + "Result_MeasureIdentifier", } ) diff --git a/tests/waterdata_test.py b/tests/waterdata_test.py index 5ac65a74..32acd6ae 100644 --- a/tests/waterdata_test.py +++ b/tests/waterdata_test.py @@ -85,8 +85,14 @@ def test_mock_get_samples(httpx_mock): monitoringLocationIdentifier="USGS-05406500", ) assert type(df) is DataFrame - # 181 source columns + 6 derived DateTime columns - assert df.shape == (67, 187) + # 181 source columns + 6 derived DateTime columns − 2 hash IDs + # (Activity_ActivityIdentifier, Result_MeasureIdentifier) dropped by default. + assert df.shape == (67, 185) + assert "Activity_ActivityIdentifier" not in df.columns + assert "Result_MeasureIdentifier" not in df.columns + # Stable identifiers are preserved. + assert "Location_Identifier" in df.columns + assert "Org_Identifier" in df.columns assert md.url == request_url assert isinstance(md.query_time, datetime.timedelta) assert md.header.get("mock_header") == "value" @@ -94,6 +100,29 @@ def test_mock_get_samples(httpx_mock): assert df["Activity_StartDateTime"].notna().any() +def test_mock_get_samples_include_hash_ids(httpx_mock): + """``include_hash_ids=True`` restores the legacy column set.""" + request_url = ( + "https://api.waterdata.usgs.gov/samples-data/results/fullphyschem?" 
+ "activityMediaName=Water&activityStartDateLower=2020-01-01" + "&activityStartDateUpper=2024-12-31&monitoringLocationIdentifier=USGS-05406500&mimeType=text%2Fcsv" + ) + response_file_path = "tests/data/samples_results.txt" + mock_request(httpx_mock, request_url, response_file_path) + df, _md = get_samples( + service="results", + profile="fullphyschem", + activityMediaName="Water", + activityStartDateLower="2020-01-01", + activityStartDateUpper="2024-12-31", + monitoringLocationIdentifier="USGS-05406500", + include_hash_ids=True, + ) + assert df.shape == (67, 187) + assert "Activity_ActivityIdentifier" in df.columns + assert "Result_MeasureIdentifier" in df.columns + + def test_mock_get_samples_summary(httpx_mock): """Tests USGS Samples summary query""" request_url = ( @@ -219,10 +248,11 @@ def test_samples_results(): activityStartDateLower="2024-10-01", activityStartDateUpper="2025-04-24", ) - assert all( - col in df.columns - for col in ["Location_Identifier", "Activity_ActivityIdentifier"] - ) + # Stable identifiers are kept; hash IDs (Activity_ActivityIdentifier, + # Result_MeasureIdentifier) are dropped by default. + assert "Location_Identifier" in df.columns + assert "Activity_ActivityIdentifier" not in df.columns + assert "Result_MeasureIdentifier" not in df.columns assert len(df) > 0 @@ -234,7 +264,10 @@ def test_samples_activity(): monitoringLocationIdentifier="USGS-06719505", ) assert len(df) > 0 - assert len(df.columns) == 97 + # 97 → 96 cols after dropping Activity_ActivityIdentifier + # (Result_MeasureIdentifier is not in the ``activities`` profile). 
+ assert len(df.columns) == 96 + assert "Activity_ActivityIdentifier" not in df.columns assert "Location_HUCTwelveDigitCode" in df.columns From 5bb96d380886a3beaf7fdb132f951bff817eb349 Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Sun, 17 May 2026 18:16:54 -0500 Subject: [PATCH 05/24] refactor(waterdata): unify hash-drop helper and tighten internals MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Post-PR-281 cleanup from a code review pass. No behavior change for users; the three code paths that drop hash columns (OGC, stats, samples) now share one helper, and a few smaller wins. - New ``_drop_hash_columns(df, include_hash_ids, keep=None)`` helper replaces three near-identical drop blocks across ``_arrange_cols``, ``get_stats_data``, and ``get_samples``. Uses ``(set(df.columns) & _HASH_ID_COLUMNS) - keep`` (one set intersection plus a set difference) in place of the per-call list-comprehension. The helper still returns ``df`` as-is when no hash columns match, so ``df.drop`` is only called when there is something to drop. - ``_HASH_ID_COLUMNS`` no longer leaks into ``api.py`` — the samples branch now calls the helper instead of touching the constant. - New ``_properties_unspecified(properties)`` extracts the ``properties is None or all(pd.isna(properties))`` predicate so the five call sites that need it stop drifting between ``not properties`` and ``properties is None`` variants. - ``_default_non_hash_properties`` memoizes its result by ``(service, output_id)`` via ``_default_props_cache``. The previous version rebuilt a ~30–100-item list on every OGC call after the queryables cache was warm; this saves the per-call list rebuild on the hot path. - ``get_ogc_data`` flattens its nested ``try/except + if/else`` branches into a single ``use_server_trim`` flag, removing the duplicated ``_switch_properties_id`` assignment. - The benchmark no longer clears ``_queryables_cache`` between measured rounds.
Clearing per round only penalizes the default path (the legacy path doesn't consult the cache), so the previous comparison was pessimistic against the default. Real-world callers pay the queryables fetch once per process. - The benchmark's ``capture_fixtures`` now calls ``_default_non_hash_properties`` directly instead of reimplementing the filter, so the "trimmed" fixture matches what the runtime actually sends bit-for-bit. All 157 unit and mock tests pass after the refactor. Co-Authored-By: Claude Opus 4.7 (1M context) --- benchmarks/bench_include_hash_ids.py | 43 +++---- dataretrieval/waterdata/api.py | 12 +- dataretrieval/waterdata/utils.py | 163 ++++++++++++++------------- 3 files changed, 110 insertions(+), 108 deletions(-) diff --git a/benchmarks/bench_include_hash_ids.py b/benchmarks/bench_include_hash_ids.py index 9dff5c7b..6609e3c3 100644 --- a/benchmarks/bench_include_hash_ids.py +++ b/benchmarks/bench_include_hash_ids.py @@ -79,10 +79,15 @@ def __str__(self) -> str: def time_call(label: str, **kwargs) -> RunResult: """One end-to-end ``get_continuous`` call, with wall time, peak RSS - and final DataFrame memory captured.""" - # Reset the queryables cache so each configuration pays the same - # one-time schema-fetch cost (when it applies). - wd_utils._queryables_cache.clear() + and final DataFrame memory captured. + + Note: ``_queryables_cache`` is left warm across measured rounds — + clearing it each round would charge the default-path with an extra + HTTP request per round while the legacy path (which doesn't use + the cache) gets a free pass, inflating the default-path wall time. + Real-world callers issue many queries per process and pay the + queryables fetch only once. + """ gc.collect() tracemalloc.start() @@ -135,20 +140,15 @@ def capture_fixtures() -> None: session = requests.Session() headers = wd_utils._default_headers() - # Queryables (used only by the default code path). 
- q_url = f"{wd_utils.OGC_API_URL}/collections/continuous/queryables" - print(f"Fetching {q_url}") - r = session.get(q_url, headers=headers) - r.raise_for_status() - FIXTURE_QUERYABLES.write_bytes(r.content) - - body = json.loads(r.content) - all_props = list(body.get("properties", {}).keys()) - non_hash = [ - p - for p in all_props - if p not in wd_utils._HASH_ID_COLUMNS and p != "geometry" and p != "id" - ] + # Queryables (used only by the default code path). Fetched via the + # cached helper so the trimmed-payload request below sends the + # *exact* same property list the runtime would — otherwise the + # benchmark's "trimmed" fixture could drift from production. + print("Fetching queryables …") + non_hash = wd_utils._default_non_hash_properties("continuous", "continuous_id") + # Mirror the bytes back into a fixture for ``time_offline`` to load. + q_body = {"properties": {p: {} for p in wd_utils._service_queryables("continuous")}} + FIXTURE_QUERYABLES.write_bytes(json.dumps(q_body).encode()) base = f"{wd_utils.OGC_API_URL}/collections/continuous/items" common = { @@ -187,10 +187,13 @@ def time_offline(label: str, payload_path: Path, include_hash_ids: bool) -> RunR """Measure parsing + DataFrame construction time on a captured payload. ``client.send`` is patched to return the recorded response, so this isolates the local-CPU portion of a call from - network variability and rate-limit pressure.""" + network variability and rate-limit pressure. + + ``_queryables_cache`` and ``_default_props_cache`` are left warm + across rounds (see ``time_call`` for the rationale). 
+ """ body = payload_path.read_bytes() queryables_body = FIXTURE_QUERYABLES.read_bytes() - wd_utils._queryables_cache.clear() gc.collect() def _send(req, *args, **kwargs): diff --git a/dataretrieval/waterdata/api.py b/dataretrieval/waterdata/api.py index 5cb97ab5..d1afa102 100644 --- a/dataretrieval/waterdata/api.py +++ b/dataretrieval/waterdata/api.py @@ -30,10 +30,10 @@ SERVICES, ) from dataretrieval.waterdata.utils import ( - _HASH_ID_COLUMNS, SAMPLES_URL, _check_profiles, _default_headers, + _drop_hash_columns, _get_args, get_ogc_data, get_stats_data, @@ -2504,15 +2504,7 @@ def get_samples( df = pd.read_csv(StringIO(response.text), delimiter=",") df = _attach_datetime_columns(df) - - # Drop hash-valued ID columns (``Activity_ActivityIdentifier``, - # ``Result_MeasureIdentifier`` — both UUIDs) by default. - # Stable identifiers like ``Org_Identifier``, - # ``Location_Identifier``, ``Project_Identifier`` are kept. - if not include_hash_ids: - drop_cols = [c for c in df.columns if c in _HASH_ID_COLUMNS] - if drop_cols: - df = df.drop(columns=drop_cols) + df = _drop_hash_columns(df, include_hash_ids) return df, BaseMetadata(response) diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py index 14e9d9c8..e24f592d 100644 --- a/dataretrieval/waterdata/utils.py +++ b/dataretrieval/waterdata/utils.py @@ -228,6 +228,11 @@ def _switch_properties_id(properties: list[str] | None, id_name: str, service: s # ``include_hash_ids=False``. Keyed by service name; value is the full # list of property names the server exposes for that collection. _queryables_cache: dict[str, list[str]] = {} +# Cache of the derived non-hash property whitelist, keyed by +# ``(service, output_id)``. Both inputs determine the result, and +# both are stable per call site — re-deriving on every OGC request +# would do ~30–100 frozenset lookups per call for no reason. 
+_default_props_cache: dict[tuple[str, str], list[str]] = {} def _service_queryables(service: str) -> list[str]: @@ -252,28 +257,57 @@ def _default_non_hash_properties(service: str, output_id: str) -> list[str]: """Build the ``properties=`` whitelist sent to the server when the caller didn't supply one and ``include_hash_ids=False``. - Returns the service's queryables minus: - - any column whose wire-format name is in :data:`_HASH_ID_COLUMNS` - (the secondary hashes like ``time_series_id``, - ``parent_time_series_id``, ``field_visit_id``); - - the wire-format ``"id"`` column, but only when the service's - ``output_id`` (its post-rename name) is itself a hash column — - i.e., for ``daily``/``continuous``/``peaks``/etc., where ``id`` - becomes ``daily_id``/``continuous_id``/``peak_id``. For - ``monitoring-locations`` (where ``id`` becomes the AGENCY-ID - ``monitoring_location_id``) the ``id`` column is kept; - - ``"geometry"`` (the OGC server returns geometry via the feature - envelope, not as a property — listing it would be redundant and - some collections reject it). + The whitelist is the service's queryables minus :data:`_HASH_ID_COLUMNS`, + minus ``"geometry"`` (the OGC server returns geometry via the feature + envelope, not as a property — some collections reject it as a + property name), and minus the wire-format ``"id"`` column when the + service's ``output_id`` is itself a hash column (e.g. ``daily_id``). + For ``monitoring-locations``, ``id`` becomes the AGENCY-ID + ``monitoring_location_id``, so it's kept. 
""" + key = (service, output_id) + cached = _default_props_cache.get(key) + if cached is not None: + return cached drop_wire_id = output_id in _HASH_ID_COLUMNS - return [ + props = [ p for p in _service_queryables(service) if p not in _HASH_ID_COLUMNS and p != "geometry" and not (drop_wire_id and p == "id") ] + _default_props_cache[key] = props + return props + + +def _properties_unspecified(properties) -> bool: + """True when the caller didn't pin a ``properties`` list. + + A ``None``, empty list, or list-of-only-NaN counts as unspecified. + Centralizes the predicate so the (subtly different) ``not properties`` + vs ``properties is None`` variants across call sites stay aligned. + """ + return not properties or all(pd.isna(properties)) + + +def _drop_hash_columns( + df: pd.DataFrame, + include_hash_ids: bool, + keep: set[str] | None = None, +) -> pd.DataFrame: + """Drop hash-valued ID columns from ``df`` when not opting in. + + When ``include_hash_ids`` is True, returns ``df`` unchanged. Otherwise + drops every column whose name is in :data:`_HASH_ID_COLUMNS`, except + those the caller listed in ``keep`` (e.g. names appearing in an + explicit user ``properties=`` request — explicit beats default). + A no-op when no hash columns are present. + """ + if include_hash_ids: + return df + drop = (set(df.columns) & _HASH_ID_COLUMNS) - (keep or set()) + return df.drop(columns=drop) if drop else df def _parse_datetime(value: str) -> datetime | None: @@ -1234,7 +1268,9 @@ def _arrange_cols( # Rename id column to output_id df = df.rename(columns={"id": output_id}) - if properties and not all(pd.isna(properties)): + user_specified = not _properties_unspecified(properties) + + if user_specified: # Don't alias the caller's list — we mutate below. 
local_properties = list(properties) if "geometry" in df.columns and "geometry" not in local_properties: @@ -1245,49 +1281,32 @@ def _arrange_cols( local_properties[local_properties.index("id")] = output_id df = df.loc[:, [col for col in local_properties if col in df.columns]] - # Default: drop hash-valued ID columns the caller didn't ask for. - # This is the client-side counterpart to the server-side - # ``properties=`` trim done in ``get_ogc_data``; it's a no-op on - # the happy path (the server already omitted them) but catches the - # fallback case where the queryables fetch failed and we - # round-tripped a full payload. Drops apply uniformly even when the - # hash column is the service's renamed ``output_id`` (e.g., - # ``daily_id``) — the user-meaningful identifiers - # (``monitoring_location_id`` + ``time`` + ``parameter_code`` + - # ``statistic_id``) are sufficient to pin a row. - if not include_hash_ids: - requested = ( - set(properties) if properties and not all(pd.isna(properties)) else set() - ) - # ``"id"`` in ``properties`` resolves to the renamed ``output_id`` - # (matching the rename done above and in ``_switch_properties_id``), - # so treat the user as having asked for that output_id too. - if "id" in requested: - requested.add(output_id) - drop_cols = [ - col - for col in df.columns - if col in _HASH_ID_COLUMNS and col not in requested - ] - if drop_cols: - df = df.drop(columns=drop_cols) + # Client-side safety net for the server-side trim done in + # ``get_ogc_data``: no-op on the happy path (server already omitted + # hash columns), drops them here when the queryables fetch failed + # and we fell back to a full payload. An explicit caller + # ``properties`` list — including ``"id"``, which resolved to + # ``output_id`` above — wins over the default. 
+ keep: set[str] = set() + if user_specified: + keep = set(properties) + if "id" in keep: + keep.add(output_id) + df = _drop_hash_columns(df, include_hash_ids, keep=keep) # Legacy ordering: when ``include_hash_ids=True`` and ``properties`` - # is unspecified, move the per-record version IDs to the end of the - # DataFrame so they don't crowd the front. With - # ``include_hash_ids=False`` those columns are gone above, so this - # branch becomes a no-op. - extra_id_col = set(df.columns).intersection( - { - "latest_continuous_id", - "latest_daily_id", - "daily_id", - "continuous_id", - "field_measurement_id", - } - ) + # is unspecified, move the per-record version IDs to the end so they + # don't crowd the front. With ``include_hash_ids=False`` those + # columns are gone above, so this branch is a no-op. + extra_id_col = set(df.columns) & { + "latest_continuous_id", + "latest_daily_id", + "daily_id", + "continuous_id", + "field_measurement_id", + } - if extra_id_col and (properties is None or all(pd.isna(properties))): + if extra_id_col and _properties_unspecified(properties): id_col_order = [col for col in df.columns if col not in extra_id_col] + list( extra_id_col ) @@ -1408,29 +1427,23 @@ def get_ogc_data( include_hash_ids = args.pop("include_hash_ids", False) # When the caller didn't pin ``properties`` and isn't opting into - # hash IDs, send a server-side whitelist of the non-hash columns so - # the server (a) skips serializing UUID/hex fields and (b) returns - # a smaller payload for us to parse. ``_arrange_cols`` still sees - # ``properties=None`` (the original user input), so columns retain - # the schema's natural order rather than being subset to whatever - # whitelist we synthesized here. - if not include_hash_ids and (properties is None or all(pd.isna(properties))): + # hash IDs, try a server-side whitelist of the non-hash columns so + # the server skips serializing UUID/hex fields. 
On any queryables + # failure, fall through to the full payload — ``_arrange_cols`` + # post-processes the drop as a safety net. + use_server_trim = not include_hash_ids and _properties_unspecified(properties) + if use_server_trim: try: args["properties"] = _default_non_hash_properties(service, output_id) except (requests.HTTPError, requests.RequestException, ValueError) as exc: - # Server-side trim is an optimization, not a correctness - # requirement — fall back to a full payload and rely on the - # ``_arrange_cols`` post-processing drop below. logger.warning( "Could not fetch queryables for %s (%s); " "falling back to client-side hash-ID drop.", service, exc, ) - args["properties"] = _switch_properties_id( - properties, id_name=output_id, service=service - ) - else: + use_server_trim = False + if not use_server_trim: args["properties"] = _switch_properties_id( properties, id_name=output_id, service=service ) @@ -1704,15 +1717,9 @@ def follow_up(cursor: str, client: httpx.Client) -> httpx.Response: if expand_percentiles: df = _expand_percentiles(df) - # Drop hash-valued ID columns at the end (after - # ``_expand_percentiles``, which still needs ``computation_id`` - # as a merge key while it explodes the percentile lists into - # rows). Stable identifiers (``monitoring_location_id``, - # ``parameter_code``, ``time_of_year``, …) are kept. - if not include_hash_ids: - drop_cols = [col for col in df.columns if col in _HASH_ID_COLUMNS] - if drop_cols: - df = df.drop(columns=drop_cols) + # Drop hash IDs after ``_expand_percentiles`` — it merges on + # ``computation_id`` while exploding the percentile lists. 
+ df = _drop_hash_columns(df, include_hash_ids) return df, BaseMetadata(response) From b6ca21132fcea4db3e0d2f039943be8e3d0d5775 Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Sun, 17 May 2026 20:29:59 -0500 Subject: [PATCH 06/24] chore(waterdata): drop benchmark scaffolding from PR The ``benchmarks/`` directory was useful during development for measuring the hash-ID-drop impact, but it's not part of the runtime or test surface and the numbers are captured in the PR description. Removing to keep the diff focused on the library change. Co-Authored-By: Claude Opus 4.7 (1M context) --- benchmarks/_fixtures/.gitignore | 4 - benchmarks/_fixtures/synthesize.py | 123 --------- benchmarks/bench_include_hash_ids.py | 359 --------------------------- benchmarks/results_offline.txt | 22 -- 4 files changed, 508 deletions(-) delete mode 100644 benchmarks/_fixtures/.gitignore delete mode 100644 benchmarks/_fixtures/synthesize.py delete mode 100644 benchmarks/bench_include_hash_ids.py delete mode 100644 benchmarks/results_offline.txt diff --git a/benchmarks/_fixtures/.gitignore b/benchmarks/_fixtures/.gitignore deleted file mode 100644 index 5efdd79b..00000000 --- a/benchmarks/_fixtures/.gitignore +++ /dev/null @@ -1,4 +0,0 @@ -# Synthesized or captured response payloads are large (≈10–15 MB each) -# and trivially regenerated. Keep them out of the repo; the script that -# produces them is committed. -*.json diff --git a/benchmarks/_fixtures/synthesize.py b/benchmarks/_fixtures/synthesize.py deleted file mode 100644 index 72e07ffb..00000000 --- a/benchmarks/_fixtures/synthesize.py +++ /dev/null @@ -1,123 +0,0 @@ -"""Generate synthetic OGC API payloads for the offline benchmark. 
- -We can't always reach the live USGS API (rate limits, no token), but -the local cost of an ``include_hash_ids=False`` vs ``True`` call is -dominated by: - - JSON parsing (``response.json()``) - - ``pandas.json_normalize`` over the features list - - DataFrame column allocation - -All three scale with payload bytes and feature count. A synthetic -payload that mirrors the real wire format and the real per-row column -shape is sufficient to measure them. -""" - -from __future__ import annotations - -import json -import uuid -from datetime import datetime, timedelta, timezone -from pathlib import Path - -ROWS = 30000 # ~1 year of 15-minute continuous data -HERE = Path(__file__).parent - - -def _row(i: int) -> dict: - ts = datetime(2024, 1, 1, tzinfo=timezone.utc) + timedelta(minutes=15 * i) - return { - "id": str(uuid.uuid4()), - "time_series_id": uuid.uuid4().hex, - "monitoring_location_id": "USGS-02238500", - "parameter_code": "00060", - "statistic_id": "00011", - "time": ts.strftime("%Y-%m-%dT%H:%M:%S+00:00"), - "value": f"{100.0 + 0.01 * (i % 1000):.2f}", - "unit_of_measure": "ft^3/s", - "approval_status": "Approved", - "qualifier": None, - "last_modified": "2026-05-01T00:00:00+00:00", - } - - -def _feature(props: dict) -> dict: - return { - "type": "Feature", - "properties": props, - "id": props.get("id", ""), - "geometry": None, - } - - -HASH_COLS = {"id", "time_series_id"} - - -def build_full() -> dict: - features = [_feature(_row(i)) for i in range(ROWS)] - return { - "type": "FeatureCollection", - "features": features, - "numberReturned": ROWS, - "links": [], - } - - -def build_trimmed() -> dict: - features = [] - for i in range(ROWS): - props = {k: v for k, v in _row(i).items() if k not in HASH_COLS} - features.append( - { - "type": "Feature", - "properties": props, - "id": "", - "geometry": None, - } - ) - return { - "type": "FeatureCollection", - "features": features, - "numberReturned": ROWS, - "links": [], - } - - -def build_queryables() -> dict: - 
return { - "properties": { - "geometry": {}, - "id": {}, - "time_series_id": {}, - "monitoring_location_id": {}, - "parameter_code": {}, - "statistic_id": {}, - "time": {}, - "value": {}, - "unit_of_measure": {}, - "approval_status": {}, - "qualifier": {}, - "last_modified": {}, - } - } - - -def main() -> None: - HERE.mkdir(exist_ok=True) - full = build_full() - trimmed = build_trimmed() - queryables = build_queryables() - - (HERE / "continuous_full.json").write_text(json.dumps(full)) - (HERE / "continuous_trimmed.json").write_text(json.dumps(trimmed)) - (HERE / "continuous_queryables.json").write_text(json.dumps(queryables)) - - full_size = (HERE / "continuous_full.json").stat().st_size - trim_size = (HERE / "continuous_trimmed.json").stat().st_size - pct = 100 * (full_size - trim_size) / full_size - print(f"rows: {ROWS:,}") - print(f"full: {full_size:>12,} bytes") - print(f"trimmed: {trim_size:>12,} bytes ({pct:.1f}% smaller)") - - -if __name__ == "__main__": - main() diff --git a/benchmarks/bench_include_hash_ids.py b/benchmarks/bench_include_hash_ids.py deleted file mode 100644 index 6609e3c3..00000000 --- a/benchmarks/bench_include_hash_ids.py +++ /dev/null @@ -1,359 +0,0 @@ -"""Benchmark: default (``include_hash_ids=False``) vs legacy -(``include_hash_ids=True``) on a large ``get_daily`` query. - -The two settings are functionally equivalent except for the presence of -UUID/hex-hash ID columns in the response. The hash columns are: - - ``daily_id`` — 36-char UUID, one per record - - ``time_series_id`` — 32-char hex hash, one per record - -For a 50,000-row response that's 68 bytes/row of hash, plus JSON -overhead — roughly 4 MB of payload that we now neither transfer nor -parse. 
The expected wins: - - smaller HTTP payload (server-side ``properties=`` trim) - - fewer columns to ``json_normalize`` in pandas - - smaller DataFrame footprint - -Run with:: - - API_USGS_PAT= python benchmarks/bench_include_hash_ids.py - -Without a token, USGS rate-limits to ~120 requests/hour which is enough -for a single comparison run but not for retrying. The script clears the -queryables cache between runs so the schema-fetch cost is amortized -across both configurations. -""" - -from __future__ import annotations - -import argparse -import gc -import json -import sys -import time -import tracemalloc -from dataclasses import dataclass -from pathlib import Path -from unittest import mock - -import requests - -from dataretrieval.waterdata import get_continuous -from dataretrieval.waterdata import utils as wd_utils - -# A long-running, high-frequency gage. Continuous (sub-hourly) records -# yield O(10⁴) rows per year per parameter — the time window below is -# tuned so a single query returns ~one page worth of data, large enough -# that JSON parsing/DataFrame construction dominates over network -# round-trip variability. -SITE = "USGS-02238500" -PARAMETER = "00060" -# ~1 year of 15-min flow data ≈ 35,000 rows, just under one page. -TIME_RANGE = "2023-01-01/2024-01-01" - -# Where to stash the captured payload for the offline benchmark mode. 
-FIXTURE_DIR = Path(__file__).parent / "_fixtures" -FIXTURE_FULL = FIXTURE_DIR / "continuous_full.json" -FIXTURE_TRIMMED = FIXTURE_DIR / "continuous_trimmed.json" -FIXTURE_QUERYABLES = FIXTURE_DIR / "continuous_queryables.json" - - -@dataclass -class RunResult: - label: str - wall_seconds: float - rows: int - cols: int - mem_peak_bytes: int - memory_usage_bytes: int - - def __str__(self) -> str: - return ( - f" {self.label:>32}: " - f"{self.wall_seconds:6.2f}s " - f"rows={self.rows:>7} " - f"cols={self.cols:>2} " - f"peak_mem={self.mem_peak_bytes / 1024 / 1024:6.1f} MB " - f"df_mem={self.memory_usage_bytes / 1024 / 1024:6.1f} MB" - ) - - -def time_call(label: str, **kwargs) -> RunResult: - """One end-to-end ``get_continuous`` call, with wall time, peak RSS - and final DataFrame memory captured. - - Note: ``_queryables_cache`` is left warm across measured rounds — - clearing it each round would charge the default-path with an extra - HTTP request per round while the legacy path (which doesn't use - the cache) gets a free pass, inflating the default-path wall time. - Real-world callers issue many queries per process and pay the - queryables fetch only once. - """ - gc.collect() - - tracemalloc.start() - start = time.perf_counter() - df, _md = get_continuous( - monitoring_location_id=SITE, - parameter_code=PARAMETER, - time=TIME_RANGE, - **kwargs, - ) - wall = time.perf_counter() - start - _current, peak = tracemalloc.get_traced_memory() - tracemalloc.stop() - - return RunResult( - label=label, - wall_seconds=wall, - rows=len(df), - cols=df.shape[1], - mem_peak_bytes=peak, - memory_usage_bytes=int(df.memory_usage(deep=True).sum()), - ) - - -def _make_mock_response(body_bytes: bytes, status: int = 200) -> mock.Mock: - """Build a ``requests.Response``-shaped mock backed by ``body_bytes``. 
- - Only the attributes ``_walk_pages`` and ``_get_resp_data`` touch - need to be real; ``elapsed``, ``status_code``, ``headers``, ``json()``, - and ``raise_for_status()`` cover those callers. Pagination is - forced single-page by stripping any "next" link from the body - before the test patches it in. - """ - resp = mock.Mock(spec=requests.Response) - resp.status_code = status - resp.headers = {"x-ratelimit-remaining": "1000"} - resp.elapsed = __import__("datetime").timedelta(milliseconds=1) - resp.url = "https://test/mock" - resp.text = body_bytes.decode("utf-8") - resp.json = lambda: json.loads(body_bytes) - resp.raise_for_status = lambda: None - return resp - - -def capture_fixtures() -> None: - """Snapshot the two response payloads (with and without hash IDs) - plus the queryables response, so the ``--offline`` benchmark can - parse them locally with no network calls.""" - FIXTURE_DIR.mkdir(exist_ok=True) - session = requests.Session() - headers = wd_utils._default_headers() - - # Queryables (used only by the default code path). Fetched via the - # cached helper so the trimmed-payload request below sends the - # *exact* same property list the runtime would — otherwise the - # benchmark's "trimmed" fixture could drift from production. - print("Fetching queryables …") - non_hash = wd_utils._default_non_hash_properties("continuous", "continuous_id") - # Mirror the bytes back into a fixture for ``time_offline`` to load. - q_body = {"properties": {p: {} for p in wd_utils._service_queryables("continuous")}} - FIXTURE_QUERYABLES.write_bytes(json.dumps(q_body).encode()) - - base = f"{wd_utils.OGC_API_URL}/collections/continuous/items" - common = { - "monitoring_location_id": SITE, - "parameter_code": PARAMETER, - "time": TIME_RANGE, - "skipGeometry": True, - "limit": 50000, - } - - # Full payload (legacy behavior — every column). 
- print("Fetching full payload …") - r = session.get(base, headers=headers, params=common) - r.raise_for_status() - FIXTURE_FULL.write_bytes(r.content) - print(f" → {FIXTURE_FULL.name}: {len(r.content):,} bytes") - - # Trimmed payload (new default — non-hash columns only). - print("Fetching trimmed payload …") - params = dict(common, properties=",".join(non_hash)) - r = session.get(base, headers=headers, params=params) - r.raise_for_status() - FIXTURE_TRIMMED.write_bytes(r.content) - print(f" → {FIXTURE_TRIMMED.name}: {len(r.content):,} bytes") - - full_size = FIXTURE_FULL.stat().st_size - trim_size = FIXTURE_TRIMMED.stat().st_size - pct = 100 * (full_size - trim_size) / full_size - print() - print( - f"Server payload size: {full_size:,} → {trim_size:,} bytes ({pct:.1f}% smaller)" - ) - - -def time_offline(label: str, payload_path: Path, include_hash_ids: bool) -> RunResult: - """Measure parsing + DataFrame construction time on a captured - payload. ``client.send`` is patched to return the recorded - response, so this isolates the local-CPU portion of a call from - network variability and rate-limit pressure. - - ``_queryables_cache`` and ``_default_props_cache`` are left warm - across rounds (see ``time_call`` for the rationale). 
- """ - body = payload_path.read_bytes() - queryables_body = FIXTURE_QUERYABLES.read_bytes() - gc.collect() - - def _send(req, *args, **kwargs): - return _make_mock_response(body) - - def _get(url, *args, **kwargs): - return _make_mock_response(queryables_body) - - with ( - mock.patch.object(requests.Session, "send", _send), - mock.patch.object(requests, "get", _get), - ): - tracemalloc.start() - start = time.perf_counter() - df, _md = get_continuous( - monitoring_location_id=SITE, - parameter_code=PARAMETER, - time=TIME_RANGE, - include_hash_ids=include_hash_ids, - ) - wall = time.perf_counter() - start - _current, peak = tracemalloc.get_traced_memory() - tracemalloc.stop() - - return RunResult( - label=label, - wall_seconds=wall, - rows=len(df), - cols=df.shape[1], - mem_peak_bytes=peak, - memory_usage_bytes=int(df.memory_usage(deep=True).sum()), - ) - - -def main() -> int: - parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument( - "--capture", - action="store_true", - help="Fetch and save the two response payloads as fixtures, then exit.", - ) - parser.add_argument( - "--offline", - action="store_true", - help="Use captured fixtures instead of hitting the live API. " - "Isolates parsing/DataFrame cost from network variability.", - ) - parser.add_argument( - "--rounds", - type=int, - default=3, - help="Number of measured rounds per configuration (default: 3).", - ) - args = parser.parse_args() - - if args.capture: - capture_fixtures() - return 0 - - if args.offline: - if not ( - FIXTURE_FULL.exists() - and FIXTURE_TRIMMED.exists() - and FIXTURE_QUERYABLES.exists() - ): - print( - "Missing fixtures. 
Run with --capture first to record them.", - file=sys.stderr, - ) - return 1 - - full_size = FIXTURE_FULL.stat().st_size - trim_size = FIXTURE_TRIMMED.stat().st_size - print( - f"Offline benchmark on fixtures " - f"(full: {full_size:,} B, trimmed: {trim_size:,} B, " - f"server-side savings: {100 * (full_size - trim_size) / full_size:.1f}%)" - ) - print() - - # Warm up to load pandas/geopandas/numpy code paths. - time_offline("warmup_default", FIXTURE_TRIMMED, include_hash_ids=False) - time_offline("warmup_legacy", FIXTURE_FULL, include_hash_ids=True) - - runs = [] - for _ in range(args.rounds): - runs.append( - time_offline( - "default (hash IDs dropped)", - FIXTURE_TRIMMED, - include_hash_ids=False, - ) - ) - runs.append( - time_offline( - "include_hash_ids=True", - FIXTURE_FULL, - include_hash_ids=True, - ) - ) - else: - print( - f"Benchmarking get_continuous(site={SITE!r}, parameter={PARAMETER!r}, " - f"time={TIME_RANGE!r})" - ) - print() - - # Warmup once with each configuration to absorb DNS/TLS/cache - # cold-start effects, then run measured rounds. 
- print("Warming up …") - time_call("warmup_default") - time_call("warmup_legacy", include_hash_ids=True) - - runs = [] - for _ in range(args.rounds): - runs.append(time_call("default (hash IDs dropped)")) - runs.append(time_call("include_hash_ids=True", include_hash_ids=True)) - - print("All runs:") - for r in runs: - print(r) - print() - - best_default = min( - (r for r in runs if r.label.startswith("default")), - key=lambda r: r.wall_seconds, - ) - best_legacy = min( - (r for r in runs if r.label.startswith("include_hash_ids")), - key=lambda r: r.wall_seconds, - ) - - print("Best of each:") - print(best_default) - print(best_legacy) - print() - - wall_delta = best_legacy.wall_seconds - best_default.wall_seconds - wall_pct = ( - 100 * wall_delta / best_legacy.wall_seconds if best_legacy.wall_seconds else 0.0 - ) - mem_delta = best_legacy.memory_usage_bytes - best_default.memory_usage_bytes - mem_pct = ( - 100 * mem_delta / best_legacy.memory_usage_bytes - if best_legacy.memory_usage_bytes - else 0.0 - ) - peak_delta = best_legacy.mem_peak_bytes - best_default.mem_peak_bytes - peak_pct = ( - 100 * peak_delta / best_legacy.mem_peak_bytes - if best_legacy.mem_peak_bytes - else 0.0 - ) - - print(f"Wall-clock speedup: {wall_delta:+.2f}s ({wall_pct:+.1f}%)") - print(f"DataFrame memory: {mem_delta / 1024 / 1024:+.1f} MB ({mem_pct:+.1f}%)") - print(f"Peak traced memory: {peak_delta / 1024 / 1024:+.1f} MB ({peak_pct:+.1f}%)") - print(f"Columns dropped: {best_legacy.cols - best_default.cols}") - return 0 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/benchmarks/results_offline.txt b/benchmarks/results_offline.txt deleted file mode 100644 index e7e10919..00000000 --- a/benchmarks/results_offline.txt +++ /dev/null @@ -1,22 +0,0 @@ -Offline benchmark on fixtures (full: 14,310,081 B, trimmed: 10,230,081 B, server-side savings: 28.5%) - -All runs: - default (hash IDs dropped): 0.96s rows= 30000 cols= 9 peak_mem= 73.9 MB df_mem= 9.4 MB - include_hash_ids=True: 
1.09s rows= 30000 cols=11 peak_mem= 94.1 MB df_mem= 14.2 MB - default (hash IDs dropped): 0.94s rows= 30000 cols= 9 peak_mem= 73.9 MB df_mem= 9.4 MB - include_hash_ids=True: 1.08s rows= 30000 cols=11 peak_mem= 94.1 MB df_mem= 14.2 MB - default (hash IDs dropped): 0.94s rows= 30000 cols= 9 peak_mem= 73.9 MB df_mem= 9.4 MB - include_hash_ids=True: 1.06s rows= 30000 cols=11 peak_mem= 94.1 MB df_mem= 14.2 MB - default (hash IDs dropped): 0.97s rows= 30000 cols= 9 peak_mem= 73.9 MB df_mem= 9.4 MB - include_hash_ids=True: 1.09s rows= 30000 cols=11 peak_mem= 94.1 MB df_mem= 14.2 MB - default (hash IDs dropped): 0.97s rows= 30000 cols= 9 peak_mem= 73.9 MB df_mem= 9.4 MB - include_hash_ids=True: 1.05s rows= 30000 cols=11 peak_mem= 94.1 MB df_mem= 14.2 MB - -Best of each: - default (hash IDs dropped): 0.94s rows= 30000 cols= 9 peak_mem= 73.9 MB df_mem= 9.4 MB - include_hash_ids=True: 1.05s rows= 30000 cols=11 peak_mem= 94.1 MB df_mem= 14.2 MB - -Wall-clock speedup: +0.11s (+10.8%) -DataFrame memory: +4.7 MB (+33.5%) -Peak traced memory: +20.2 MB (+21.5%) -Columns dropped: 2 From 8591740e393df4137c9df412806c2934de67eb12 Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Sun, 24 May 2026 16:47:07 -0500 Subject: [PATCH 07/24] docs(waterdata): correct include_hash_ids stability claims The include_hash_ids docstrings claimed the dropped columns "are not stable across record refreshes." That is true for the per-record version UUID (daily_id/continuous_id/computation_id), which is regenerated on every refresh, but wrong for the secondary hashes (time_series_id, parent_time_series_id, field_series_id/field_measurements_series_id): real-data checks confirm these are stable and are the documented join keys back to the metadata endpoints (e.g. time_series_id links a values row to get_time_series_metadata; the API docs recommend (time, time_series_id) to identify an observation over time). 
Rewrite the 11 OGC getter blocks, the _HASH_ID_COLUMNS comment, and the get_stats_data block to distinguish the unstable per-record UUID from the stable-but-opaque join keys, and tell callers to set include_hash_ids=True (or name the column in properties) when they need to join, trace series lineage, or disambiguate series sharing the same (monitoring_location_id, parameter_code, statistic_id). Docs only; no behavior change. Co-Authored-By: Claude Opus 4.7 (1M context) --- dataretrieval/waterdata/api.py | 275 ++++++++++++++++++------------- dataretrieval/waterdata/utils.py | 30 ++-- 2 files changed, 184 insertions(+), 121 deletions(-) diff --git a/dataretrieval/waterdata/api.py b/dataretrieval/waterdata/api.py index d1afa102..3b940947 100644 --- a/dataretrieval/waterdata/api.py +++ b/dataretrieval/waterdata/api.py @@ -196,18 +196,23 @@ def get_daily( convert_type : boolean, optional If True, converts columns to appropriate types. include_hash_ids : boolean, optional - If False (default), hash-valued ID columns (the per-record UUID - used as the row's primary key, plus secondary hash columns such - as ``time_series_id``, ``parent_time_series_id``, - ``field_visit_id``, and ``field_measurements_series_id``) are - omitted from the response. These IDs are not stable across - record refreshes and are not human-meaningful; dropping them - also shrinks the server payload for large queries. Stable, + If False (default), hash-valued ID columns are omitted from the + response. Two kinds are dropped. The per-record UUID used as the + row's primary key (e.g. ``daily_id``) is regenerated every time a + record is refreshed and so is not stable over time. The secondary + hash columns (``time_series_id``, ``parent_time_series_id``, + ``field_visit_id``, ``field_measurements_series_id``) are stable + but opaque, and they bloat the payload of large queries. 
Stable, human-meaningful identifiers like ``monitoring_location_id``, ``parameter_code``, and ``statistic_id`` are always returned. - Set to True to restore the pre-existing behavior of including - every column. Listing a hash column explicitly in - ``properties`` also overrides this default for that column. + Note that the secondary hashes are the join keys back to the + metadata endpoints -- ``time_series_id`` links a values row to + ``get_time_series_metadata`` (and ``parent_time_series_id`` links + a derived series to its source). Set this to True, or name the + specific column in ``properties``, when you need those keys to + join, to trace series lineage, or to disambiguate series that + share the same ``(monitoring_location_id, parameter_code, + statistic_id)``. Returns ------- @@ -420,18 +425,23 @@ def get_continuous( If True, the function will convert the data to dates and qualifier to string vector include_hash_ids : boolean, optional - If False (default), hash-valued ID columns (the per-record UUID - used as the row's primary key, plus secondary hash columns such - as ``time_series_id``, ``parent_time_series_id``, - ``field_visit_id``, and ``field_measurements_series_id``) are - omitted from the response. These IDs are not stable across - record refreshes and are not human-meaningful; dropping them - also shrinks the server payload for large queries. Stable, + If False (default), hash-valued ID columns are omitted from the + response. Two kinds are dropped. The per-record UUID used as the + row's primary key (e.g. ``daily_id``) is regenerated every time a + record is refreshed and so is not stable over time. The secondary + hash columns (``time_series_id``, ``parent_time_series_id``, + ``field_visit_id``, ``field_measurements_series_id``) are stable + but opaque, and they bloat the payload of large queries. Stable, human-meaningful identifiers like ``monitoring_location_id``, ``parameter_code``, and ``statistic_id`` are always returned. 
- Set to True to restore the pre-existing behavior of including - every column. Listing a hash column explicitly in - ``properties`` also overrides this default for that column. + Note that the secondary hashes are the join keys back to the + metadata endpoints -- ``time_series_id`` links a values row to + ``get_time_series_metadata`` (and ``parent_time_series_id`` links + a derived series to its source). Set this to True, or name the + specific column in ``properties``, when you need those keys to + join, to trace series lineage, or to disambiguate series that + share the same ``(monitoring_location_id, parameter_code, + statistic_id)``. Returns ------- @@ -742,18 +752,23 @@ def get_monitoring_locations( convert_type : boolean, optional If True, converts columns to appropriate types. include_hash_ids : boolean, optional - If False (default), hash-valued ID columns (the per-record UUID - used as the row's primary key, plus secondary hash columns such - as ``time_series_id``, ``parent_time_series_id``, - ``field_visit_id``, and ``field_measurements_series_id``) are - omitted from the response. These IDs are not stable across - record refreshes and are not human-meaningful; dropping them - also shrinks the server payload for large queries. Stable, + If False (default), hash-valued ID columns are omitted from the + response. Two kinds are dropped. The per-record UUID used as the + row's primary key (e.g. ``daily_id``) is regenerated every time a + record is refreshed and so is not stable over time. The secondary + hash columns (``time_series_id``, ``parent_time_series_id``, + ``field_visit_id``, ``field_measurements_series_id``) are stable + but opaque, and they bloat the payload of large queries. Stable, human-meaningful identifiers like ``monitoring_location_id``, ``parameter_code``, and ``statistic_id`` are always returned. - Set to True to restore the pre-existing behavior of including - every column. 
Listing a hash column explicitly in - ``properties`` also overrides this default for that column. + Note that the secondary hashes are the join keys back to the + metadata endpoints -- ``time_series_id`` links a values row to + ``get_time_series_metadata`` (and ``parent_time_series_id`` links + a derived series to its source). Set this to True, or name the + specific column in ``properties``, when you need those keys to + join, to trace series lineage, or to disambiguate series that + share the same ``(monitoring_location_id, parameter_code, + statistic_id)``. Returns ------- @@ -979,18 +994,23 @@ def get_time_series_metadata( convert_type : boolean, optional If True, converts columns to appropriate types. include_hash_ids : boolean, optional - If False (default), hash-valued ID columns (the per-record UUID - used as the row's primary key, plus secondary hash columns such - as ``time_series_id``, ``parent_time_series_id``, - ``field_visit_id``, and ``field_measurements_series_id``) are - omitted from the response. These IDs are not stable across - record refreshes and are not human-meaningful; dropping them - also shrinks the server payload for large queries. Stable, + If False (default), hash-valued ID columns are omitted from the + response. Two kinds are dropped. The per-record UUID used as the + row's primary key (e.g. ``daily_id``) is regenerated every time a + record is refreshed and so is not stable over time. The secondary + hash columns (``time_series_id``, ``parent_time_series_id``, + ``field_visit_id``, ``field_measurements_series_id``) are stable + but opaque, and they bloat the payload of large queries. Stable, human-meaningful identifiers like ``monitoring_location_id``, ``parameter_code``, and ``statistic_id`` are always returned. - Set to True to restore the pre-existing behavior of including - every column. Listing a hash column explicitly in - ``properties`` also overrides this default for that column. 
+ Note that the secondary hashes are the join keys back to the + metadata endpoints -- ``time_series_id`` links a values row to + ``get_time_series_metadata`` (and ``parent_time_series_id`` links + a derived series to its source). Set this to True, or name the + specific column in ``properties``, when you need those keys to + join, to trace series lineage, or to disambiguate series that + share the same ``(monitoring_location_id, parameter_code, + statistic_id)``. Returns ------- @@ -1190,18 +1210,23 @@ def get_combined_metadata( convert_type : boolean, optional If True, converts columns to appropriate types. include_hash_ids : boolean, optional - If False (default), hash-valued ID columns (the per-record UUID - used as the row's primary key, plus secondary hash columns such - as ``time_series_id``, ``parent_time_series_id``, - ``field_visit_id``, and ``field_measurements_series_id``) are - omitted from the response. These IDs are not stable across - record refreshes and are not human-meaningful; dropping them - also shrinks the server payload for large queries. Stable, + If False (default), hash-valued ID columns are omitted from the + response. Two kinds are dropped. The per-record UUID used as the + row's primary key (e.g. ``daily_id``) is regenerated every time a + record is refreshed and so is not stable over time. The secondary + hash columns (``time_series_id``, ``parent_time_series_id``, + ``field_visit_id``, ``field_measurements_series_id``) are stable + but opaque, and they bloat the payload of large queries. Stable, human-meaningful identifiers like ``monitoring_location_id``, ``parameter_code``, and ``statistic_id`` are always returned. - Set to True to restore the pre-existing behavior of including - every column. Listing a hash column explicitly in - ``properties`` also overrides this default for that column. 
+ Note that the secondary hashes are the join keys back to the + metadata endpoints -- ``time_series_id`` links a values row to + ``get_time_series_metadata`` (and ``parent_time_series_id`` links + a derived series to its source). Set this to True, or name the + specific column in ``properties``, when you need those keys to + join, to trace series lineage, or to disambiguate series that + share the same ``(monitoring_location_id, parameter_code, + statistic_id)``. Returns ------- @@ -1421,18 +1446,23 @@ def get_latest_continuous( convert_type : boolean, optional If True, converts columns to appropriate types. include_hash_ids : boolean, optional - If False (default), hash-valued ID columns (the per-record UUID - used as the row's primary key, plus secondary hash columns such - as ``time_series_id``, ``parent_time_series_id``, - ``field_visit_id``, and ``field_measurements_series_id``) are - omitted from the response. These IDs are not stable across - record refreshes and are not human-meaningful; dropping them - also shrinks the server payload for large queries. Stable, + If False (default), hash-valued ID columns are omitted from the + response. Two kinds are dropped. The per-record UUID used as the + row's primary key (e.g. ``daily_id``) is regenerated every time a + record is refreshed and so is not stable over time. The secondary + hash columns (``time_series_id``, ``parent_time_series_id``, + ``field_visit_id``, ``field_measurements_series_id``) are stable + but opaque, and they bloat the payload of large queries. Stable, human-meaningful identifiers like ``monitoring_location_id``, ``parameter_code``, and ``statistic_id`` are always returned. - Set to True to restore the pre-existing behavior of including - every column. Listing a hash column explicitly in - ``properties`` also overrides this default for that column. 
+ Note that the secondary hashes are the join keys back to the + metadata endpoints -- ``time_series_id`` links a values row to + ``get_time_series_metadata`` (and ``parent_time_series_id`` links + a derived series to its source). Set this to True, or name the + specific column in ``properties``, when you need those keys to + join, to trace series lineage, or to disambiguate series that + share the same ``(monitoring_location_id, parameter_code, + statistic_id)``. Returns ------- @@ -1632,18 +1662,23 @@ def get_latest_daily( convert_type : boolean, optional If True, converts columns to appropriate types. include_hash_ids : boolean, optional - If False (default), hash-valued ID columns (the per-record UUID - used as the row's primary key, plus secondary hash columns such - as ``time_series_id``, ``parent_time_series_id``, - ``field_visit_id``, and ``field_measurements_series_id``) are - omitted from the response. These IDs are not stable across - record refreshes and are not human-meaningful; dropping them - also shrinks the server payload for large queries. Stable, + If False (default), hash-valued ID columns are omitted from the + response. Two kinds are dropped. The per-record UUID used as the + row's primary key (e.g. ``daily_id``) is regenerated every time a + record is refreshed and so is not stable over time. The secondary + hash columns (``time_series_id``, ``parent_time_series_id``, + ``field_visit_id``, ``field_measurements_series_id``) are stable + but opaque, and they bloat the payload of large queries. Stable, human-meaningful identifiers like ``monitoring_location_id``, ``parameter_code``, and ``statistic_id`` are always returned. - Set to True to restore the pre-existing behavior of including - every column. Listing a hash column explicitly in - ``properties`` also overrides this default for that column. 
+ Note that the secondary hashes are the join keys back to the + metadata endpoints -- ``time_series_id`` links a values row to + ``get_time_series_metadata`` (and ``parent_time_series_id`` links + a derived series to its source). Set this to True, or name the + specific column in ``properties``, when you need those keys to + join, to trace series lineage, or to disambiguate series that + share the same ``(monitoring_location_id, parameter_code, + statistic_id)``. Returns ------- @@ -1834,18 +1869,23 @@ def get_field_measurements( convert_type : boolean, optional If True, converts columns to appropriate types. include_hash_ids : boolean, optional - If False (default), hash-valued ID columns (the per-record UUID - used as the row's primary key, plus secondary hash columns such - as ``time_series_id``, ``parent_time_series_id``, - ``field_visit_id``, and ``field_measurements_series_id``) are - omitted from the response. These IDs are not stable across - record refreshes and are not human-meaningful; dropping them - also shrinks the server payload for large queries. Stable, + If False (default), hash-valued ID columns are omitted from the + response. Two kinds are dropped. The per-record UUID used as the + row's primary key (e.g. ``daily_id``) is regenerated every time a + record is refreshed and so is not stable over time. The secondary + hash columns (``time_series_id``, ``parent_time_series_id``, + ``field_visit_id``, ``field_measurements_series_id``) are stable + but opaque, and they bloat the payload of large queries. Stable, human-meaningful identifiers like ``monitoring_location_id``, ``parameter_code``, and ``statistic_id`` are always returned. - Set to True to restore the pre-existing behavior of including - every column. Listing a hash column explicitly in - ``properties`` also overrides this default for that column. 
+ Note that the secondary hashes are the join keys back to the + metadata endpoints -- ``time_series_id`` links a values row to + ``get_time_series_metadata`` (and ``parent_time_series_id`` links + a derived series to its source). Set this to True, or name the + specific column in ``properties``, when you need those keys to + join, to trace series lineage, or to disambiguate series that + share the same ``(monitoring_location_id, parameter_code, + statistic_id)``. Returns ------- @@ -1966,18 +2006,23 @@ def get_field_measurements_metadata( convert_type : boolean, optional If True, converts columns to appropriate types. include_hash_ids : boolean, optional - If False (default), hash-valued ID columns (the per-record UUID - used as the row's primary key, plus secondary hash columns such - as ``time_series_id``, ``parent_time_series_id``, - ``field_visit_id``, and ``field_measurements_series_id``) are - omitted from the response. These IDs are not stable across - record refreshes and are not human-meaningful; dropping them - also shrinks the server payload for large queries. Stable, + If False (default), hash-valued ID columns are omitted from the + response. Two kinds are dropped. The per-record UUID used as the + row's primary key (e.g. ``daily_id``) is regenerated every time a + record is refreshed and so is not stable over time. The secondary + hash columns (``time_series_id``, ``parent_time_series_id``, + ``field_visit_id``, ``field_measurements_series_id``) are stable + but opaque, and they bloat the payload of large queries. Stable, human-meaningful identifiers like ``monitoring_location_id``, ``parameter_code``, and ``statistic_id`` are always returned. - Set to True to restore the pre-existing behavior of including - every column. Listing a hash column explicitly in - ``properties`` also overrides this default for that column. 
+ Note that the secondary hashes are the join keys back to the + metadata endpoints -- ``time_series_id`` links a values row to + ``get_time_series_metadata`` (and ``parent_time_series_id`` links + a derived series to its source). Set this to True, or name the + specific column in ``properties``, when you need those keys to + join, to trace series lineage, or to disambiguate series that + share the same ``(monitoring_location_id, parameter_code, + statistic_id)``. Returns ------- @@ -2104,18 +2149,23 @@ def get_peaks( convert_type : boolean, optional If True, converts columns to appropriate types. include_hash_ids : boolean, optional - If False (default), hash-valued ID columns (the per-record UUID - used as the row's primary key, plus secondary hash columns such - as ``time_series_id``, ``parent_time_series_id``, - ``field_visit_id``, and ``field_measurements_series_id``) are - omitted from the response. These IDs are not stable across - record refreshes and are not human-meaningful; dropping them - also shrinks the server payload for large queries. Stable, + If False (default), hash-valued ID columns are omitted from the + response. Two kinds are dropped. The per-record UUID used as the + row's primary key (e.g. ``daily_id``) is regenerated every time a + record is refreshed and so is not stable over time. The secondary + hash columns (``time_series_id``, ``parent_time_series_id``, + ``field_visit_id``, ``field_measurements_series_id``) are stable + but opaque, and they bloat the payload of large queries. Stable, human-meaningful identifiers like ``monitoring_location_id``, ``parameter_code``, and ``statistic_id`` are always returned. - Set to True to restore the pre-existing behavior of including - every column. Listing a hash column explicitly in - ``properties`` also overrides this default for that column. 
+ Note that the secondary hashes are the join keys back to the + metadata endpoints -- ``time_series_id`` links a values row to + ``get_time_series_metadata`` (and ``parent_time_series_id`` links + a derived series to its source). Set this to True, or name the + specific column in ``properties``, when you need those keys to + join, to trace series lineage, or to disambiguate series that + share the same ``(monitoring_location_id, parameter_code, + statistic_id)``. Returns ------- @@ -3005,18 +3055,23 @@ def get_channel( If True, the function will convert the data to dates and qualifier to string vector include_hash_ids : boolean, optional - If False (default), hash-valued ID columns (the per-record UUID - used as the row's primary key, plus secondary hash columns such - as ``time_series_id``, ``parent_time_series_id``, - ``field_visit_id``, and ``field_measurements_series_id``) are - omitted from the response. These IDs are not stable across - record refreshes and are not human-meaningful; dropping them - also shrinks the server payload for large queries. Stable, + If False (default), hash-valued ID columns are omitted from the + response. Two kinds are dropped. The per-record UUID used as the + row's primary key (e.g. ``daily_id``) is regenerated every time a + record is refreshed and so is not stable over time. The secondary + hash columns (``time_series_id``, ``parent_time_series_id``, + ``field_visit_id``, ``field_measurements_series_id``) are stable + but opaque, and they bloat the payload of large queries. Stable, human-meaningful identifiers like ``monitoring_location_id``, ``parameter_code``, and ``statistic_id`` are always returned. - Set to True to restore the pre-existing behavior of including - every column. Listing a hash column explicitly in - ``properties`` also overrides this default for that column. 
+ Note that the secondary hashes are the join keys back to the + metadata endpoints -- ``time_series_id`` links a values row to + ``get_time_series_metadata`` (and ``parent_time_series_id`` links + a derived series to its source). Set this to True, or name the + specific column in ``properties``, when you need those keys to + join, to trace series lineage, or to disambiguate series that + share the same ``(monitoring_location_id, parameter_code, + statistic_id)``. Returns ------- diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py index e24f592d..ef58e630 100644 --- a/dataretrieval/waterdata/utils.py +++ b/dataretrieval/waterdata/utils.py @@ -182,17 +182,23 @@ def _switch_properties_id(properties: list[str] | None, id_name: str, service: s _CQL2_REQUIRED_SERVICES = frozenset({"monitoring-locations"}) # Column names whose values are server-generated hashes (UUIDs or hex -# digests). These are unstable across record refreshes — joining or -# diffing on them produces spurious churn — and they bloat the payload -# of large queries. Dropped by default; opt in with -# ``include_hash_ids=True``. Includes both: +# digests): opaque, non-human-meaningful, and a payload-bloat on large +# queries. Dropped by default; opt in with ``include_hash_ids=True``. +# Includes two kinds with different stability: # - The per-record version UUIDs that are aliased to a service's -# ``output_id`` (``daily_id``, ``continuous_id``, …). These get +# ``output_id`` (``daily_id``, ``continuous_id``, …). These are +# regenerated on every record refresh, so they are NOT stable over +# time and joining/diffing on them produces spurious churn. They get # mapped to/from ``"id"`` on the wire; both names are listed so the # filter works on either side of ``_switch_properties_id``. # - Secondary hash columns embedded in record payloads # (``time_series_id``, ``field_visit_id``, ``parent_time_series_id``, -# ``field_measurements_series_id``). 
+# ``field_measurements_series_id``). These ARE stable and are the +# join keys back to the metadata endpoints (e.g. ``time_series_id`` +# links a values row to ``get_time_series_metadata``); they're +# dropped only because they're opaque and bloat the payload, so a +# caller who needs to join sets ``include_hash_ids=True`` or names +# the column in ``properties``. # ``monitoring_location_id`` (AGENCY-ID format, e.g. ``USGS-01646500``) # and other code columns (``parameter_code``, ``statistic_id``, …) are # intentionally absent — they're stable, human-meaningful identifiers. @@ -1666,11 +1672,13 @@ def get_stats_data( percentiles, a percentile column is still returned. include_hash_ids : bool, optional If False (default), the per-computation UUID (``computation_id``) - and the upstream time-series hex hash (``parent_time_series_id``) - are dropped from the returned DataFrame. These IDs are not - stable across record refreshes; ``computation_id`` is used as a - join key internally during percentile expansion and only - removed after that step completes. + and the upstream time-series hash (``parent_time_series_id``) are + dropped from the returned DataFrame. ``computation_id`` is a + per-record version UUID that is not stable across refreshes (it is + used as a join key internally during percentile expansion and only + removed after that step completes). ``parent_time_series_id`` is a + stable join key back to ``get_time_series_metadata``. Set this to + True to keep both. Returns ------- From c5973fc35b3f1924546532b876c8f3a9b1586263 Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Sun, 24 May 2026 16:57:45 -0500 Subject: [PATCH 08/24] refactor(waterdata): rename include_hash_ids to include_hash; tighten docs Rename the opt-in flag include_hash_ids -> include_hash across the OGC getters, get_samples, get_stats_*, the internal helpers, and the tests. The flag is new in this PR (never on main), so this is not a breaking change. 
Collapse the per-getter docstrings to two lines. Co-Authored-By: Claude Opus 4.7 (1M context) --- dataretrieval/waterdata/api.py | 302 +++++++------------------------ dataretrieval/waterdata/utils.py | 48 +++-- tests/waterdata_test.py | 20 +- tests/waterdata_utils_test.py | 18 +- 4 files changed, 101 insertions(+), 287 deletions(-) diff --git a/dataretrieval/waterdata/api.py b/dataretrieval/waterdata/api.py index 3b940947..d9521e24 100644 --- a/dataretrieval/waterdata/api.py +++ b/dataretrieval/waterdata/api.py @@ -62,7 +62,7 @@ def get_daily( filter: str | None = None, filter_lang: FILTER_LANG | None = None, convert_type: bool = True, - include_hash_ids: bool = False, + include_hash: bool = False, ) -> tuple[pd.DataFrame, BaseMetadata]: """Daily data provide one data value to represent water conditions for the day. @@ -195,24 +195,9 @@ def get_daily( and the lexicographic-comparison pitfall. convert_type : boolean, optional If True, converts columns to appropriate types. - include_hash_ids : boolean, optional - If False (default), hash-valued ID columns are omitted from the - response. Two kinds are dropped. The per-record UUID used as the - row's primary key (e.g. ``daily_id``) is regenerated every time a - record is refreshed and so is not stable over time. The secondary - hash columns (``time_series_id``, ``parent_time_series_id``, - ``field_visit_id``, ``field_measurements_series_id``) are stable - but opaque, and they bloat the payload of large queries. Stable, - human-meaningful identifiers like ``monitoring_location_id``, - ``parameter_code``, and ``statistic_id`` are always returned. - Note that the secondary hashes are the join keys back to the - metadata endpoints -- ``time_series_id`` links a values row to - ``get_time_series_metadata`` (and ``parent_time_series_id`` links - a derived series to its source). 
Set this to True, or name the - specific column in ``properties``, when you need those keys to - join, to trace series lineage, or to disambiguate series that - share the same ``(monitoring_location_id, parameter_code, - statistic_id)``. + include_hash : boolean, optional + If False (default), drop the opaque hash-valued ID columns. Set True to + keep the secondary hashes (e.g. ``time_series_id``) that join to metadata. Returns ------- @@ -296,7 +281,7 @@ def get_continuous( filter: str | None = None, filter_lang: FILTER_LANG | None = None, convert_type: bool = True, - include_hash_ids: bool = False, + include_hash: bool = False, ) -> tuple[pd.DataFrame, BaseMetadata]: """ Continuous data provide instantaneous water conditions. @@ -424,24 +409,9 @@ def get_continuous( convert_type : boolean, optional If True, the function will convert the data to dates and qualifier to string vector - include_hash_ids : boolean, optional - If False (default), hash-valued ID columns are omitted from the - response. Two kinds are dropped. The per-record UUID used as the - row's primary key (e.g. ``daily_id``) is regenerated every time a - record is refreshed and so is not stable over time. The secondary - hash columns (``time_series_id``, ``parent_time_series_id``, - ``field_visit_id``, ``field_measurements_series_id``) are stable - but opaque, and they bloat the payload of large queries. Stable, - human-meaningful identifiers like ``monitoring_location_id``, - ``parameter_code``, and ``statistic_id`` are always returned. - Note that the secondary hashes are the join keys back to the - metadata endpoints -- ``time_series_id`` links a values row to - ``get_time_series_metadata`` (and ``parent_time_series_id`` links - a derived series to its source). Set this to True, or name the - specific column in ``properties``, when you need those keys to - join, to trace series lineage, or to disambiguate series that - share the same ``(monitoring_location_id, parameter_code, - statistic_id)``. 
+ include_hash : boolean, optional + If False (default), drop the opaque hash-valued ID columns. Set True to + keep the secondary hashes (e.g. ``time_series_id``) that join to metadata. Returns ------- @@ -535,7 +505,7 @@ def get_monitoring_locations( filter: str | None = None, filter_lang: FILTER_LANG | None = None, convert_type: bool = True, - include_hash_ids: bool = False, + include_hash: bool = False, ) -> tuple[pd.DataFrame, BaseMetadata]: """Location information is basic information about the monitoring location including the name, identifier, agency responsible for data collection, and @@ -751,24 +721,9 @@ def get_monitoring_locations( and the lexicographic-comparison pitfall. convert_type : boolean, optional If True, converts columns to appropriate types. - include_hash_ids : boolean, optional - If False (default), hash-valued ID columns are omitted from the - response. Two kinds are dropped. The per-record UUID used as the - row's primary key (e.g. ``daily_id``) is regenerated every time a - record is refreshed and so is not stable over time. The secondary - hash columns (``time_series_id``, ``parent_time_series_id``, - ``field_visit_id``, ``field_measurements_series_id``) are stable - but opaque, and they bloat the payload of large queries. Stable, - human-meaningful identifiers like ``monitoring_location_id``, - ``parameter_code``, and ``statistic_id`` are always returned. - Note that the secondary hashes are the join keys back to the - metadata endpoints -- ``time_series_id`` links a values row to - ``get_time_series_metadata`` (and ``parent_time_series_id`` links - a derived series to its source). Set this to True, or name the - specific column in ``properties``, when you need those keys to - join, to trace series lineage, or to disambiguate series that - share the same ``(monitoring_location_id, parameter_code, - statistic_id)``. + include_hash : boolean, optional + If False (default), drop the opaque hash-valued ID columns. 
Set True to + keep the secondary hashes (e.g. ``time_series_id``) that join to metadata. Returns ------- @@ -832,7 +787,7 @@ def get_time_series_metadata( filter: str | None = None, filter_lang: FILTER_LANG | None = None, convert_type: bool = True, - include_hash_ids: bool = False, + include_hash: bool = False, ) -> tuple[pd.DataFrame, BaseMetadata]: """Daily data and continuous measurements are grouped into time series, which represent a collection of observations of a single parameter, @@ -993,24 +948,9 @@ def get_time_series_metadata( and the lexicographic-comparison pitfall. convert_type : boolean, optional If True, converts columns to appropriate types. - include_hash_ids : boolean, optional - If False (default), hash-valued ID columns are omitted from the - response. Two kinds are dropped. The per-record UUID used as the - row's primary key (e.g. ``daily_id``) is regenerated every time a - record is refreshed and so is not stable over time. The secondary - hash columns (``time_series_id``, ``parent_time_series_id``, - ``field_visit_id``, ``field_measurements_series_id``) are stable - but opaque, and they bloat the payload of large queries. Stable, - human-meaningful identifiers like ``monitoring_location_id``, - ``parameter_code``, and ``statistic_id`` are always returned. - Note that the secondary hashes are the join keys back to the - metadata endpoints -- ``time_series_id`` links a values row to - ``get_time_series_metadata`` (and ``parent_time_series_id`` links - a derived series to its source). Set this to True, or name the - specific column in ``properties``, when you need those keys to - join, to trace series lineage, or to disambiguate series that - share the same ``(monitoring_location_id, parameter_code, - statistic_id)``. + include_hash : boolean, optional + If False (default), drop the opaque hash-valued ID columns. Set True to + keep the secondary hashes (e.g. ``time_series_id``) that join to metadata. 
Returns ------- @@ -1108,7 +1048,7 @@ def get_combined_metadata( filter: str | None = None, filter_lang: FILTER_LANG | None = None, convert_type: bool = True, - include_hash_ids: bool = False, + include_hash: bool = False, ) -> tuple[pd.DataFrame, BaseMetadata]: """Get combined monitoring-location and time-series metadata. @@ -1209,24 +1149,9 @@ def get_combined_metadata( and the lexicographic-comparison pitfall. convert_type : boolean, optional If True, converts columns to appropriate types. - include_hash_ids : boolean, optional - If False (default), hash-valued ID columns are omitted from the - response. Two kinds are dropped. The per-record UUID used as the - row's primary key (e.g. ``daily_id``) is regenerated every time a - record is refreshed and so is not stable over time. The secondary - hash columns (``time_series_id``, ``parent_time_series_id``, - ``field_visit_id``, ``field_measurements_series_id``) are stable - but opaque, and they bloat the payload of large queries. Stable, - human-meaningful identifiers like ``monitoring_location_id``, - ``parameter_code``, and ``statistic_id`` are always returned. - Note that the secondary hashes are the join keys back to the - metadata endpoints -- ``time_series_id`` links a values row to - ``get_time_series_metadata`` (and ``parent_time_series_id`` links - a derived series to its source). Set this to True, or name the - specific column in ``properties``, when you need those keys to - join, to trace series lineage, or to disambiguate series that - share the same ``(monitoring_location_id, parameter_code, - statistic_id)``. + include_hash : boolean, optional + If False (default), drop the opaque hash-valued ID columns. Set True to + keep the secondary hashes (e.g. ``time_series_id``) that join to metadata. 
Returns ------- @@ -1315,7 +1240,7 @@ def get_latest_continuous( filter: str | None = None, filter_lang: FILTER_LANG | None = None, convert_type: bool = True, - include_hash_ids: bool = False, + include_hash: bool = False, ) -> tuple[pd.DataFrame, BaseMetadata]: """This endpoint provides the most recent observation for each time series of continuous data. Continuous data are collected via automated sensors @@ -1445,24 +1370,9 @@ def get_latest_continuous( and the lexicographic-comparison pitfall. convert_type : boolean, optional If True, converts columns to appropriate types. - include_hash_ids : boolean, optional - If False (default), hash-valued ID columns are omitted from the - response. Two kinds are dropped. The per-record UUID used as the - row's primary key (e.g. ``daily_id``) is regenerated every time a - record is refreshed and so is not stable over time. The secondary - hash columns (``time_series_id``, ``parent_time_series_id``, - ``field_visit_id``, ``field_measurements_series_id``) are stable - but opaque, and they bloat the payload of large queries. Stable, - human-meaningful identifiers like ``monitoring_location_id``, - ``parameter_code``, and ``statistic_id`` are always returned. - Note that the secondary hashes are the join keys back to the - metadata endpoints -- ``time_series_id`` links a values row to - ``get_time_series_metadata`` (and ``parent_time_series_id`` links - a derived series to its source). Set this to True, or name the - specific column in ``properties``, when you need those keys to - join, to trace series lineage, or to disambiguate series that - share the same ``(monitoring_location_id, parameter_code, - statistic_id)``. + include_hash : boolean, optional + If False (default), drop the opaque hash-valued ID columns. Set True to + keep the secondary hashes (e.g. ``time_series_id``) that join to metadata. 
Returns ------- @@ -1529,7 +1439,7 @@ def get_latest_daily( filter: str | None = None, filter_lang: FILTER_LANG | None = None, convert_type: bool = True, - include_hash_ids: bool = False, + include_hash: bool = False, ) -> tuple[pd.DataFrame, BaseMetadata]: """Daily data provide one data value to represent water conditions for the day. @@ -1661,24 +1571,9 @@ def get_latest_daily( and the lexicographic-comparison pitfall. convert_type : boolean, optional If True, converts columns to appropriate types. - include_hash_ids : boolean, optional - If False (default), hash-valued ID columns are omitted from the - response. Two kinds are dropped. The per-record UUID used as the - row's primary key (e.g. ``daily_id``) is regenerated every time a - record is refreshed and so is not stable over time. The secondary - hash columns (``time_series_id``, ``parent_time_series_id``, - ``field_visit_id``, ``field_measurements_series_id``) are stable - but opaque, and they bloat the payload of large queries. Stable, - human-meaningful identifiers like ``monitoring_location_id``, - ``parameter_code``, and ``statistic_id`` are always returned. - Note that the secondary hashes are the join keys back to the - metadata endpoints -- ``time_series_id`` links a values row to - ``get_time_series_metadata`` (and ``parent_time_series_id`` links - a derived series to its source). Set this to True, or name the - specific column in ``properties``, when you need those keys to - join, to trace series lineage, or to disambiguate series that - share the same ``(monitoring_location_id, parameter_code, - statistic_id)``. + include_hash : boolean, optional + If False (default), drop the opaque hash-valued ID columns. Set True to + keep the secondary hashes (e.g. ``time_series_id``) that join to metadata. 
Returns ------- @@ -1746,7 +1641,7 @@ def get_field_measurements( filter: str | None = None, filter_lang: FILTER_LANG | None = None, convert_type: bool = True, - include_hash_ids: bool = False, + include_hash: bool = False, ) -> tuple[pd.DataFrame, BaseMetadata]: """Field measurements are physically measured values collected during a visit to the monitoring location. Field measurements consist of measurements @@ -1868,24 +1763,9 @@ def get_field_measurements( and the lexicographic-comparison pitfall. convert_type : boolean, optional If True, converts columns to appropriate types. - include_hash_ids : boolean, optional - If False (default), hash-valued ID columns are omitted from the - response. Two kinds are dropped. The per-record UUID used as the - row's primary key (e.g. ``daily_id``) is regenerated every time a - record is refreshed and so is not stable over time. The secondary - hash columns (``time_series_id``, ``parent_time_series_id``, - ``field_visit_id``, ``field_measurements_series_id``) are stable - but opaque, and they bloat the payload of large queries. Stable, - human-meaningful identifiers like ``monitoring_location_id``, - ``parameter_code``, and ``statistic_id`` are always returned. - Note that the secondary hashes are the join keys back to the - metadata endpoints -- ``time_series_id`` links a values row to - ``get_time_series_metadata`` (and ``parent_time_series_id`` links - a derived series to its source). Set this to True, or name the - specific column in ``properties``, when you need those keys to - join, to trace series lineage, or to disambiguate series that - share the same ``(monitoring_location_id, parameter_code, - statistic_id)``. + include_hash : boolean, optional + If False (default), drop the opaque hash-valued ID columns. Set True to + keep the secondary hashes (e.g. ``time_series_id``) that join to metadata. 
Returns ------- @@ -1949,7 +1829,7 @@ def get_field_measurements_metadata( filter: str | None = None, filter_lang: FILTER_LANG | None = None, convert_type: bool = True, - include_hash_ids: bool = False, + include_hash: bool = False, ) -> tuple[pd.DataFrame, BaseMetadata]: """Get field-measurement metadata: one row per (location, parameter) series. @@ -2005,24 +1885,9 @@ def get_field_measurements_metadata( and the lexicographic-comparison pitfall. convert_type : boolean, optional If True, converts columns to appropriate types. - include_hash_ids : boolean, optional - If False (default), hash-valued ID columns are omitted from the - response. Two kinds are dropped. The per-record UUID used as the - row's primary key (e.g. ``daily_id``) is regenerated every time a - record is refreshed and so is not stable over time. The secondary - hash columns (``time_series_id``, ``parent_time_series_id``, - ``field_visit_id``, ``field_measurements_series_id``) are stable - but opaque, and they bloat the payload of large queries. Stable, - human-meaningful identifiers like ``monitoring_location_id``, - ``parameter_code``, and ``statistic_id`` are always returned. - Note that the secondary hashes are the join keys back to the - metadata endpoints -- ``time_series_id`` links a values row to - ``get_time_series_metadata`` (and ``parent_time_series_id`` links - a derived series to its source). Set this to True, or name the - specific column in ``properties``, when you need those keys to - join, to trace series lineage, or to disambiguate series that - share the same ``(monitoring_location_id, parameter_code, - statistic_id)``. + include_hash : boolean, optional + If False (default), drop the opaque hash-valued ID columns. Set True to + keep the secondary hashes (e.g. ``time_series_id``) that join to metadata. 
Returns ------- @@ -2089,7 +1954,7 @@ def get_peaks( filter: str | None = None, filter_lang: FILTER_LANG | None = None, convert_type: bool = True, - include_hash_ids: bool = False, + include_hash: bool = False, ) -> tuple[pd.DataFrame, BaseMetadata]: """Get the annual peak streamflow / stage record for a monitoring location. @@ -2148,24 +2013,9 @@ def get_peaks( and the lexicographic-comparison pitfall. convert_type : boolean, optional If True, converts columns to appropriate types. - include_hash_ids : boolean, optional - If False (default), hash-valued ID columns are omitted from the - response. Two kinds are dropped. The per-record UUID used as the - row's primary key (e.g. ``daily_id``) is regenerated every time a - record is refreshed and so is not stable over time. The secondary - hash columns (``time_series_id``, ``parent_time_series_id``, - ``field_visit_id``, ``field_measurements_series_id``) are stable - but opaque, and they bloat the payload of large queries. Stable, - human-meaningful identifiers like ``monitoring_location_id``, - ``parameter_code``, and ``statistic_id`` are always returned. - Note that the secondary hashes are the join keys back to the - metadata endpoints -- ``time_series_id`` links a values row to - ``get_time_series_metadata`` (and ``parent_time_series_id`` links - a derived series to its source). Set this to True, or name the - specific column in ``properties``, when you need those keys to - join, to trace series lineage, or to disambiguate series that - share the same ``(monitoring_location_id, parameter_code, - statistic_id)``. + include_hash : boolean, optional + If False (default), drop the opaque hash-valued ID columns. Set True to + keep the secondary hashes (e.g. ``time_series_id``) that join to metadata. 
Returns ------- @@ -2343,7 +2193,7 @@ def get_samples( pointLocationWithinMiles: float | None = None, projectIdentifier: str | Iterable[str] | None = None, recordIdentifierUserSupplied: str | Iterable[str] | None = None, - include_hash_ids: bool = False, + include_hash: bool = False, ) -> tuple[pd.DataFrame, BaseMetadata]: """Search Samples database for USGS water quality data. This is a wrapper function for the Samples database API. All potential @@ -2474,14 +2324,9 @@ def get_samples( recordIdentifierUserSupplied : string or iterable of strings, optional Internal AQS record identifier that returns 1 entry. Only available for the "results" service. - include_hash_ids : boolean, optional - If False (default), the per-activity UUID - (``Activity_ActivityIdentifier``) and per-result UUID - (``Result_MeasureIdentifier``) are dropped from the returned - DataFrame. Stable identifiers (``Org_Identifier``, - ``Location_Identifier``, ``Project_Identifier``, - ``USGSpcode``, …) are kept. Set to True to restore the legacy - behavior of including every column. + include_hash : boolean, optional + If False (default), drop the opaque per-activity / per-result UUID columns + (``Activity_ActivityIdentifier``, ``Result_MeasureIdentifier``). 
Returns ------- @@ -2531,7 +2376,7 @@ def get_samples( _check_profiles(service, profile) # Build argument dictionary, omitting None values - params = _get_args(locals(), exclude={"ssl_check", "profile", "include_hash_ids"}) + params = _get_args(locals(), exclude={"ssl_check", "profile", "include_hash"}) params.update({"mimeType": "text/csv"}) @@ -2554,7 +2399,7 @@ def get_samples( df = pd.read_csv(StringIO(response.text), delimiter=",") df = _attach_datetime_columns(df) - df = _drop_hash_columns(df, include_hash_ids) + df = _drop_hash_columns(df, include_hash) return df, BaseMetadata(response) @@ -2647,7 +2492,7 @@ def get_stats_por( site_type_name: str | Iterable[str] | None = None, parameter_code: str | Iterable[str] | None = None, expand_percentiles: bool = True, - include_hash_ids: bool = False, + include_hash: bool = False, ) -> tuple[pd.DataFrame, BaseMetadata]: """Get day-of-year and month-of-year water data statistics from the USGS Water Data API. @@ -2726,13 +2571,9 @@ def get_stats_por( argument will return both the "values" column, containing the list of percentile threshold values, and a "value" column, containing the singular summary value for the other statistics. - include_hash_ids : boolean, optional - If False (default), the per-computation UUID (``computation_id``) - and the upstream time-series hex hash (``parent_time_series_id``) - are dropped from the returned DataFrame. Stable identifiers - (``monitoring_location_id``, ``parameter_code``, the time keys) - are kept. Set to True to restore the legacy behavior of - including every column. + include_hash : boolean, optional + If False (default), drop the hash columns (``computation_id``, + ``parent_time_series_id``); set True to keep them for joining to metadata. Examples -------- @@ -2757,13 +2598,13 @@ def get_stats_por( ... 
) """ # Build argument dictionary, omitting None values - params = _get_args(locals(), exclude={"expand_percentiles", "include_hash_ids"}) + params = _get_args(locals(), exclude={"expand_percentiles", "include_hash"}) return get_stats_data( args=params, service="observationNormals", expand_percentiles=expand_percentiles, - include_hash_ids=include_hash_ids, + include_hash=include_hash, ) @@ -2782,7 +2623,7 @@ def get_stats_date_range( site_type_name: str | Iterable[str] | None = None, parameter_code: str | Iterable[str] | None = None, expand_percentiles: bool = True, - include_hash_ids: bool = False, + include_hash: bool = False, ) -> tuple[pd.DataFrame, BaseMetadata]: """Get monthly and annual water data statistics from the USGS Water Data API. This service (called the "observationIntervals" endpoint on api.waterdata.usgs.gov) @@ -2865,13 +2706,9 @@ def get_stats_date_range( argument will return both the "values" column, containing the list of percentile threshold values, and a "value" column, containing the singular summary value for the other statistics. - include_hash_ids : boolean, optional - If False (default), the per-computation UUID (``computation_id``) - and the upstream time-series hex hash (``parent_time_series_id``) - are dropped from the returned DataFrame. Stable identifiers - (``monitoring_location_id``, ``parameter_code``, the time keys) - are kept. Set to True to restore the legacy behavior of - including every column. + include_hash : boolean, optional + If False (default), drop the hash columns (``computation_id``, + ``parent_time_series_id``); set True to keep them for joining to metadata. Examples -------- @@ -2897,13 +2734,13 @@ def get_stats_date_range( ... 
) """ # Build argument dictionary, omitting None values - params = _get_args(locals(), exclude={"expand_percentiles", "include_hash_ids"}) + params = _get_args(locals(), exclude={"expand_percentiles", "include_hash"}) return get_stats_data( args=params, service="observationIntervals", expand_percentiles=expand_percentiles, - include_hash_ids=include_hash_ids, + include_hash=include_hash, ) @@ -2939,7 +2776,7 @@ def get_channel( filter: str | None = None, filter_lang: FILTER_LANG | None = None, convert_type: bool = True, - include_hash_ids: bool = False, + include_hash: bool = False, ) -> tuple[pd.DataFrame, BaseMetadata]: """ Channel measurements taken as part of streamflow field measurements. @@ -3054,24 +2891,9 @@ def get_channel( convert_type : boolean, optional If True, the function will convert the data to dates and qualifier to string vector - include_hash_ids : boolean, optional - If False (default), hash-valued ID columns are omitted from the - response. Two kinds are dropped. The per-record UUID used as the - row's primary key (e.g. ``daily_id``) is regenerated every time a - record is refreshed and so is not stable over time. The secondary - hash columns (``time_series_id``, ``parent_time_series_id``, - ``field_visit_id``, ``field_measurements_series_id``) are stable - but opaque, and they bloat the payload of large queries. Stable, - human-meaningful identifiers like ``monitoring_location_id``, - ``parameter_code``, and ``statistic_id`` are always returned. - Note that the secondary hashes are the join keys back to the - metadata endpoints -- ``time_series_id`` links a values row to - ``get_time_series_metadata`` (and ``parent_time_series_id`` links - a derived series to its source). Set this to True, or name the - specific column in ``properties``, when you need those keys to - join, to trace series lineage, or to disambiguate series that - share the same ``(monitoring_location_id, parameter_code, - statistic_id)``. 
+ include_hash : boolean, optional + If False (default), drop the opaque hash-valued ID columns. Set True to + keep the secondary hashes (e.g. ``time_series_id``) that join to metadata. Returns ------- diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py index ef58e630..4dd7b681 100644 --- a/dataretrieval/waterdata/utils.py +++ b/dataretrieval/waterdata/utils.py @@ -183,7 +183,7 @@ def _switch_properties_id(properties: list[str] | None, id_name: str, service: s # Column names whose values are server-generated hashes (UUIDs or hex # digests): opaque, non-human-meaningful, and a payload-bloat on large -# queries. Dropped by default; opt in with ``include_hash_ids=True``. +# queries. Dropped by default; opt in with ``include_hash=True``. # Includes two kinds with different stability: # - The per-record version UUIDs that are aliased to a service's # ``output_id`` (``daily_id``, ``continuous_id``, …). These are @@ -197,7 +197,7 @@ def _switch_properties_id(properties: list[str] | None, id_name: str, service: s # join keys back to the metadata endpoints (e.g. ``time_series_id`` # links a values row to ``get_time_series_metadata``); they're # dropped only because they're opaque and bloat the payload, so a -# caller who needs to join sets ``include_hash_ids=True`` or names +# caller who needs to join sets ``include_hash=True`` or names # the column in ``properties``. # ``monitoring_location_id`` (AGENCY-ID format, e.g. ``USGS-01646500``) # and other code columns (``parameter_code``, ``statistic_id``, …) are @@ -231,7 +231,7 @@ def _switch_properties_id(properties: list[str] | None, id_name: str, service: s # Cache of per-service queryables column lists, populated on first call # from each service when computing the default ``properties=`` for -# ``include_hash_ids=False``. Keyed by service name; value is the full +# ``include_hash=False``. Keyed by service name; value is the full # list of property names the server exposes for that collection. 
_queryables_cache: dict[str, list[str]] = {} # Cache of the derived non-hash property whitelist, keyed by @@ -246,7 +246,7 @@ def _service_queryables(service: str) -> list[str]: One HTTP GET per service per process; the list is reused for every subsequent call. Raises ``requests.HTTPError`` on a non-200 — the - caller's ``include_hash_ids=False`` request can't be satisfied + caller's ``include_hash=False`` request can't be satisfied without it, so failing loudly is preferable to silently dropping the server-side trim. """ @@ -261,7 +261,7 @@ def _service_queryables(service: str) -> list[str]: def _default_non_hash_properties(service: str, output_id: str) -> list[str]: """Build the ``properties=`` whitelist sent to the server when the - caller didn't supply one and ``include_hash_ids=False``. + caller didn't supply one and ``include_hash=False``. The whitelist is the service's queryables minus :data:`_HASH_ID_COLUMNS`, minus ``"geometry"`` (the OGC server returns geometry via the feature @@ -299,18 +299,18 @@ def _properties_unspecified(properties) -> bool: def _drop_hash_columns( df: pd.DataFrame, - include_hash_ids: bool, + include_hash: bool, keep: set[str] | None = None, ) -> pd.DataFrame: """Drop hash-valued ID columns from ``df`` when not opting in. - When ``include_hash_ids`` is True, returns ``df`` unchanged. Otherwise + When ``include_hash`` is True, returns ``df`` unchanged. Otherwise drops every column whose name is in :data:`_HASH_ID_COLUMNS`, except those the caller listed in ``keep`` (e.g. names appearing in an explicit user ``properties=`` request — explicit beats default). A no-op when no hash columns are present. 
""" - if include_hash_ids: + if include_hash: return df drop = (set(df.columns) & _HASH_ID_COLUMNS) - (keep or set()) return df.drop(columns=drop) if drop else df @@ -1241,7 +1241,7 @@ def _arrange_cols( df: pd.DataFrame, properties: list[str] | None, output_id: str, - include_hash_ids: bool = False, + include_hash: bool = False, ) -> pd.DataFrame: """ Rearranges and renames columns in a DataFrame based on provided @@ -1256,7 +1256,7 @@ def _arrange_cols( only NaN, the function renames 'id' to output_id. output_id : str The name to which the 'id' column should be renamed if applicable. - include_hash_ids : bool, optional + include_hash : bool, optional If False (default), hash-valued ID columns (see :data:`_HASH_ID_COLUMNS`) are dropped from the result unless the caller explicitly named them in ``properties``. If True, the @@ -1298,11 +1298,11 @@ def _arrange_cols( keep = set(properties) if "id" in keep: keep.add(output_id) - df = _drop_hash_columns(df, include_hash_ids, keep=keep) + df = _drop_hash_columns(df, include_hash, keep=keep) - # Legacy ordering: when ``include_hash_ids=True`` and ``properties`` + # Legacy ordering: when ``include_hash=True`` and ``properties`` # is unspecified, move the per-record version IDs to the end so they - # don't crowd the front. With ``include_hash_ids=False`` those + # don't crowd the front. With ``include_hash=False`` those # columns are gone above, so this branch is a no-op. extra_id_col = set(df.columns) & { "latest_continuous_id", @@ -1430,14 +1430,14 @@ def get_ogc_data( # the user-facing names, not the wire-format ones. properties = args.get("properties") convert_type = args.pop("convert_type", False) - include_hash_ids = args.pop("include_hash_ids", False) + include_hash = args.pop("include_hash", False) # When the caller didn't pin ``properties`` and isn't opting into # hash IDs, try a server-side whitelist of the non-hash columns so # the server skips serializing UUID/hex fields. 
On any queryables # failure, fall through to the full payload — ``_arrange_cols`` # post-processes the drop as a safety net. - use_server_trim = not include_hash_ids and _properties_unspecified(properties) + use_server_trim = not include_hash and _properties_unspecified(properties) if use_server_trim: try: args["properties"] = _default_non_hash_properties(service, output_id) @@ -1461,7 +1461,7 @@ def get_ogc_data( if convert_type: return_list = _type_cols(return_list) return_list = _arrange_cols( - return_list, properties, output_id, include_hash_ids=include_hash_ids + return_list, properties, output_id, include_hash=include_hash ) return_list = _sort_rows(return_list) @@ -1648,7 +1648,7 @@ def get_stats_data( service: str, expand_percentiles: bool, client: httpx.Client | None = None, - include_hash_ids: bool = False, + include_hash: bool = False, ) -> tuple[pd.DataFrame, BaseMetadata]: """ Retrieves statistical data from a specified endpoint and returns it @@ -1670,15 +1670,9 @@ def get_stats_data( each percentile gets its own row in the returned dataframe. If True and user requests a computation_type other than percentiles, a percentile column is still returned. - include_hash_ids : bool, optional - If False (default), the per-computation UUID (``computation_id``) - and the upstream time-series hash (``parent_time_series_id``) are - dropped from the returned DataFrame. ``computation_id`` is a - per-record version UUID that is not stable across refreshes (it is - used as a join key internally during percentile expansion and only - removed after that step completes). ``parent_time_series_id`` is a - stable join key back to ``get_time_series_metadata``. Set this to - True to keep both. + include_hash : bool, optional + If False (default), drop the hash columns (``computation_id``, + ``parent_time_series_id``); set True to keep them for joining to metadata. 
Returns ------- @@ -1727,7 +1721,7 @@ def follow_up(cursor: str, client: httpx.Client) -> httpx.Response: # Drop hash IDs after ``_expand_percentiles`` — it merges on # ``computation_id`` while exploding the percentile lists. - df = _drop_hash_columns(df, include_hash_ids) + df = _drop_hash_columns(df, include_hash) return df, BaseMetadata(response) diff --git a/tests/waterdata_test.py b/tests/waterdata_test.py index 32acd6ae..d5d9b5eb 100644 --- a/tests/waterdata_test.py +++ b/tests/waterdata_test.py @@ -100,8 +100,8 @@ def test_mock_get_samples(httpx_mock): assert df["Activity_StartDateTime"].notna().any() -def test_mock_get_samples_include_hash_ids(httpx_mock): - """``include_hash_ids=True`` restores the legacy column set.""" +def test_mock_get_samples_include_hash(httpx_mock): + """``include_hash=True`` restores the legacy column set.""" request_url = ( "https://api.waterdata.usgs.gov/samples-data/results/fullphyschem?" "activityMediaName=Water&activityStartDateLower=2020-01-01" @@ -116,7 +116,7 @@ def test_mock_get_samples_include_hash_ids(httpx_mock): activityStartDateLower="2020-01-01", activityStartDateUpper="2024-12-31", monitoringLocationIdentifier="USGS-05406500", - include_hash_ids=True, + include_hash=True, ) assert df.shape == (67, 187) assert "Activity_ActivityIdentifier" in df.columns @@ -330,15 +330,15 @@ def test_get_daily(): assert df["value"].dtype == "float64" -def test_get_daily_include_hash_ids(): - """``include_hash_ids=True`` restores the legacy behavior: the +def test_get_daily_include_hash(): + """``include_hash=True`` restores the legacy behavior: the per-record UUID (``daily_id``) and secondary hashes (``time_series_id``) are present.""" df, _ = get_daily( monitoring_location_id="USGS-05427718", parameter_code="00060", time="2025-01-01/..", - include_hash_ids=True, + include_hash=True, ) assert "daily_id" in df.columns assert "time_series_id" in df.columns @@ -409,7 +409,7 @@ def test_get_continuous(): and "UTC" in df["time"].dtype.name 
) # Default: continuous_id (UUID) and time_series_id (hex hash) are - # dropped. Set ``include_hash_ids=True`` to keep them. + # dropped. Set ``include_hash=True`` to keep them. assert "continuous_id" not in df.columns assert "time_series_id" not in df.columns assert "monitoring_location_id" in df.columns @@ -664,8 +664,8 @@ def test_get_stats_por_expanded_false(): assert df.loc[~df["percentiles"].isna(), "value"].isnull().all() -def test_get_stats_por_include_hash_ids(): - """``include_hash_ids=True`` preserves the per-computation UUID +def test_get_stats_por_include_hash(): + """``include_hash=True`` preserves the per-computation UUID and the upstream time-series hex hash that ``get_stats_*`` used to return unconditionally.""" df, _ = get_stats_por( @@ -673,7 +673,7 @@ def test_get_stats_por_include_hash_ids(): parameter_code="00060", start_date="01-01", end_date="01-01", - include_hash_ids=True, + include_hash=True, ) assert "computation_id" in df.columns assert "parent_time_series_id" in df.columns diff --git a/tests/waterdata_utils_test.py b/tests/waterdata_utils_test.py index 5b12c494..6f9b6f76 100644 --- a/tests/waterdata_utils_test.py +++ b/tests/waterdata_utils_test.py @@ -379,7 +379,7 @@ def test_get_stats_data_warning_includes_next_token(caplog, monkeypatch): def test_get_stats_data_drops_hash_ids_by_default(monkeypatch): """``get_stats_data`` drops ``computation_id`` and ``parent_time_series_id`` from the result by default — the - ``include_hash_ids=False`` counterpart for the stats path.""" + ``include_hash=False`` counterpart for the stats path.""" from dataretrieval.waterdata.utils import get_stats_data monkeypatch.setattr( @@ -421,7 +421,7 @@ def test_get_stats_data_drops_hash_ids_by_default(monkeypatch): def test_get_stats_data_keeps_hash_ids_when_opted_in(monkeypatch): - """``include_hash_ids=True`` preserves the legacy stats columns.""" + """``include_hash=True`` preserves the legacy stats columns.""" from dataretrieval.waterdata.utils import 
get_stats_data monkeypatch.setattr( @@ -452,7 +452,7 @@ def test_get_stats_data_keeps_hash_ids_when_opted_in(monkeypatch): service="observationNormals", expand_percentiles=False, client=client, - include_hash_ids=True, + include_hash=True, ) assert "computation_id" in df.columns assert "parent_time_series_id" in df.columns @@ -614,7 +614,7 @@ def test_arrange_cols_keeps_geometry_when_present(): def test_arrange_cols_drops_hash_ids_by_default(): - """Default ``include_hash_ids=False`` drops the per-record UUID + """Default ``include_hash=False`` drops the per-record UUID (renamed to ``daily_id``) and secondary hash columns (``time_series_id``), keeping stable identifiers.""" df = pd.DataFrame( @@ -632,8 +632,8 @@ def test_arrange_cols_drops_hash_ids_by_default(): assert "value" in result.columns -def test_arrange_cols_include_hash_ids_keeps_them(): - """``include_hash_ids=True`` preserves the legacy behavior — hash +def test_arrange_cols_include_hash_keeps_them(): + """``include_hash=True`` preserves the legacy behavior — hash columns are kept and the per-record UUID lands at the end of the column order.""" df = pd.DataFrame( @@ -644,9 +644,7 @@ def test_arrange_cols_include_hash_ids_keeps_them(): "value": [1.0], } ) - result = _arrange_cols( - df, properties=None, output_id="daily_id", include_hash_ids=True - ) + result = _arrange_cols(df, properties=None, output_id="daily_id", include_hash=True) assert "daily_id" in result.columns assert "time_series_id" in result.columns # Legacy ordering: ``daily_id`` moves to the end. 
@@ -655,7 +653,7 @@ def test_arrange_cols_include_hash_ids_keeps_them(): def test_arrange_cols_explicit_properties_keep_hash_ids(): """A user who lists a hash column in ``properties`` gets it back even - with the default ``include_hash_ids=False`` — explicit beats default.""" + with the default ``include_hash=False`` — explicit beats default.""" df = pd.DataFrame( { "id": ["uuid-a"], From 26052048b7a5caaaaf2ece193dbd88c67c0ad6f3 Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Sun, 24 May 2026 21:05:50 -0500 Subject: [PATCH 09/24] feat(waterdata): add waterdata.xarray module returning CF datasets New optional module dataretrieval.waterdata.xarray mirrors the time-series getters (get_daily, get_continuous, get_latest_continuous/daily, get_nearest_continuous, get_peaks, get_field_measurements, get_stats_por/ date_range) but returns a CF-conventions xarray.Dataset instead of a DataFrame. Each wrapper calls the underlying getter with include_hash=True, looks up the series descriptors from the metadata endpoints (cached per site), and writes them onto the dataset: one data variable per parameter with long_name, units (UDUNITS where mapped), cell_methods (from the statistic/computation), and standard_name where a confident USGS-pcode -> CF mapping exists. The monitoring location is the CF discrete-sampling-geometry instance dimension (cf_role=timeseries_id) with longitude/latitude coords from point geometry; qualifier/approval_status ride along as ancillary variables; dataset attrs carry Conventions/provenance/request URL. Parameters on different clocks are outer-joined on a shared time axis; a (site, parameter) collision falls back to time_series_id as the instance dimension. xarray is an optional dependency (pip install dataretrieval[xarray]); the module raises a clear ImportError if it is missing and is not imported by dataretrieval.waterdata, so the core package stays xarray-free. Stats use a preliminary flat conversion. Adds offline converter tests. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- dataretrieval/waterdata/xarray.py | 455 ++++++++++++++++++++++++++++++ pyproject.toml | 3 + tests/waterdata_xarray_test.py | 185 ++++++++++++ 3 files changed, 643 insertions(+) create mode 100644 dataretrieval/waterdata/xarray.py create mode 100644 tests/waterdata_xarray_test.py diff --git a/dataretrieval/waterdata/xarray.py b/dataretrieval/waterdata/xarray.py new file mode 100644 index 00000000..8d81bf87 --- /dev/null +++ b/dataretrieval/waterdata/xarray.py @@ -0,0 +1,455 @@ +"""xarray-returning wrappers for the Water Data time-series getters. + +Each public function here mirrors the same-named function in +:mod:`dataretrieval.waterdata`, but returns a CF-conventions +:class:`xarray.Dataset` instead of a :class:`pandas.DataFrame`. The series +descriptors that the plain getters leave behind (parameter name, units, +statistic/computation) are looked up automatically from the metadata +endpoints and written onto the dataset: + +* each observed parameter becomes a data variable with ``long_name``, + ``units`` (UDUNITS where a mapping is known), ``cell_methods`` (derived + from the statistic/computation), and ``standard_name`` where a confident + USGS-parameter-code -> CF mapping exists; +* the monitoring location is the CF "discrete sampling geometry" instance + dimension (``cf_role = "timeseries_id"``), with ``longitude`` / ``latitude`` + coordinates when point geometry is available; +* dataset-level attributes carry ``Conventions``, provenance, and the + request URL. + +The wrappers call the underlying getter with ``include_hash=True`` so the +join key survives, perform the metadata lookup, then drop the opaque hash +columns from the user-facing result. 
+ +This module requires the optional ``xarray`` dependency:: + + pip install dataretrieval[xarray] +""" + +from __future__ import annotations + +import datetime as _dt +import re as _re +import warnings as _warnings +from functools import wraps as _wraps + +import pandas as _pd + +try: + import xarray as _xr +except ModuleNotFoundError as _exc: # pragma: no cover - exercised only sans xarray + raise ModuleNotFoundError( + "dataretrieval.waterdata.xarray requires the optional 'xarray' " + "dependency. Install it with: pip install dataretrieval[xarray]" + ) from _exc + +from . import api as _api +from .nearest import get_nearest_continuous as _get_nearest_continuous + +__all__ = [ + "get_continuous", + "get_daily", + "get_field_measurements", + "get_latest_continuous", + "get_latest_daily", + "get_nearest_continuous", + "get_peaks", + "get_stats_date_range", + "get_stats_por", +] + + +# --- CF mapping tables ----------------------------------------------------- +# Each is intentionally partial: anything not listed falls back to a sensible +# default (raw unit string kept verbatim; no standard_name emitted) rather +# than guessing and emitting a wrong CF term. + +# USGS unit strings -> UDUNITS / CF-canonical form. +_UDUNITS = { + "ft^3/s": "ft3 s-1", + "ft3/s": "ft3 s-1", + "ft": "ft", + "in": "in", + "degC": "degC", + "deg C": "degC", + "uS/cm": "uS/cm", + "mg/l": "mg L-1", + "mg/L": "mg L-1", + "tons/day": "short_ton day-1", + "%": "percent", +} + +# computation_identifier -> the operator in a CF ``cell_methods`` string. +_CELL_METHOD = { + "Mean": "mean", + "Sum": "sum", + "Maximum": "maximum", + "Max At Event Time": "maximum", + "Minimum": "minimum", + "Median": "median", + "Instantaneous": "point", +} + +# USGS 5-digit parameter code -> CF standard_name. Deliberately conservative; +# codes without a confident match are left without a standard_name. 
+_STANDARD_NAME = { + "00060": "water_volume_transport_in_river_channel", + "00010": "water_temperature", + "00065": "water_surface_height_above_reference_datum", + "63160": "water_surface_height_above_reference_datum", + "00045": "lwe_thickness_of_precipitation_amount", +} + +# Columns kept off the value pivot but surfaced as ancillary (flag) variables. +_ANCILLARY = ("qualifier", "approval_status") + + +# --- metadata lookups (cached per process) --------------------------------- + +_TS_META_CACHE: dict[str, dict[str, dict]] = {} +_FIELD_META_CACHE: dict[str, dict[str, dict]] = {} + +_TS_DESCRIPTORS = ( + "parameter_code", + "parameter_name", + "parameter_description", + "unit_of_measure", + "statistic_id", + "computation_period_identifier", + "computation_identifier", +) +_FIELD_DESCRIPTORS = ("parameter_code", "parameter_name", "parameter_description") + + +def _lookup(site_ids, cache, getter, id_col, descriptors): + """Fetch and cache metadata descriptors keyed by the series id column. + + Returns a flat ``{series_id: {descriptor: value}}`` dict covering every + requested site. One network call per not-yet-cached batch of sites; the + cache is keyed by site so repeated getter calls reuse it. 
+ """ + sites = sorted({str(s) for s in site_ids if _pd.notna(s)}) + todo = [s for s in sites if s not in cache] + if todo: + meta, _ = getter(monitoring_location_id=todo, include_hash=True) + for s in todo: + cache[s] = {} + if not meta.empty: + cols = [c for c in descriptors if c in meta.columns] + for _, row in meta.iterrows(): + site = row.get("monitoring_location_id") + sid = row.get(id_col) + if site in cache and _pd.notna(sid): + cache[site][sid] = {c: row.get(c) for c in cols} + out: dict[str, dict] = {} + for s in sites: + out.update(cache.get(s, {})) + return out + + +def _timeseries_metadata(site_ids): + return _lookup( + site_ids, + _TS_META_CACHE, + _api.get_time_series_metadata, + "time_series_id", + _TS_DESCRIPTORS, + ) + + +def _field_metadata(site_ids): + return _lookup( + site_ids, + _FIELD_META_CACHE, + _api.get_field_measurements_metadata, + "field_series_id", + _FIELD_DESCRIPTORS, + ) + + +# --- helpers --------------------------------------------------------------- + + +def _slug(name) -> str: + """A lower_snake_case, identifier-safe variable name.""" + s = _re.sub(r"[^0-9a-zA-Z]+", "_", str(name).strip().lower()).strip("_") + return s or "value" + + +def _first(series): + """First non-null value of a column, or None.""" + nonnull = series.dropna() + return nonnull.iloc[0] if len(nonnull) else None + + +def _var_attrs(desc, group, *, default_cell_method, pcode, ancillary, name): + """Build the CF attribute dict for one data variable.""" + attrs: dict[str, str] = {} + long_name = desc.get("parameter_description") or desc.get("parameter_name") + if long_name and _pd.notna(long_name): + attrs["long_name"] = str(long_name) + + unit = desc.get("unit_of_measure") + if (unit is None or _pd.isna(unit)) and "unit_of_measure" in group: + unit = _first(group["unit_of_measure"]) + if unit is not None and _pd.notna(unit): + attrs["units"] = _UDUNITS.get(str(unit), str(unit)) + + op = _CELL_METHOD.get(desc.get("computation_identifier"), default_cell_method) 
+ if op: + attrs["cell_methods"] = f"time: {op}" + + if pcode is not None and _pd.notna(pcode): + sn = _STANDARD_NAME.get(str(pcode)) + if sn: + attrs["standard_name"] = sn + attrs["usgs_parameter_code"] = str(pcode) + + stat = desc.get("statistic_id") + if (stat is None or _pd.isna(stat)) and "statistic_id" in group: + stat = _first(group["statistic_id"]) + if stat is not None and _pd.notna(stat): + attrs["usgs_statistic_id"] = str(stat) + + if ancillary: + attrs["ancillary_variables"] = " ".join(f"{name}_{c}" for c in ancillary) + return attrs + + +def _dataset_attrs(service, base_meta): + attrs = { + "Conventions": "CF-1.11", + "institution": "U.S. Geological Survey", + "source": f"USGS Water Data API ({service})", + "featureType": "timeSeries", + "history": ( + f"{_dt.datetime.now(_dt.timezone.utc).isoformat(timespec='seconds')} " + "created by dataretrieval.waterdata.xarray" + ), + } + url = getattr(base_meta, "url", None) + if url: + attrs["references"] = str(url) + return attrs + + +def _empty_dataset(service, base_meta): + ds = _xr.Dataset() + ds.attrs = _dataset_attrs(service, base_meta) + return ds + + +def _point_coords(df, inst): + """lon/lat per instance from a GeoDataFrame point geometry, or None.""" + if "geometry" not in df.columns: + return None + geo = df.dropna(subset=["geometry"]).drop_duplicates(inst) + if geo.empty: + return None + try: + lon = {row[inst]: row["geometry"].x for _, row in geo.iterrows()} + lat = {row[inst]: row["geometry"].y for _, row in geo.iterrows()} + except (AttributeError, TypeError): + return None # non-point geometry; skip rather than guess + return lon, lat + + +def _build_timeseries( + df, + base_meta, + *, + service, + series_meta, + key_col="time_series_id", + group_cols=("parameter_code", "statistic_id"), + default_cell_method=None, +): + """Long values frame -> CF timeSeries Dataset (one var per parameter).""" + if df is None or len(df) == 0: + return _empty_dataset(service, base_meta) + + df = df.copy() + # 
Normalize to naive-UTC so xarray can store datetime64 (it has no tz dtype). + times = _pd.to_datetime(df["time"], errors="coerce", utc=True) + df["time"] = times.dt.tz_localize(None) + df["value"] = _pd.to_numeric(df["value"], errors="coerce") + group_cols = [c for c in group_cols if c in df.columns] + + # Instance (DSG) dimension: the site, unless two series collide on + # (site, parameter, time) -- then fall back to the unambiguous series id. + inst = "monitoring_location_id" + if key_col in df.columns and df.duplicated(group_cols + [inst, "time"]).any(): + inst = key_col + _warnings.warn( + "multiple time series share a (site, parameter); using " + f"'{key_col}' as the instance dimension instead of " + "'monitoring_location_id'.", + stacklevel=3, + ) + + datasets, used = [], set() + for _, group in df.groupby(group_cols, dropna=False): + sid = _first(group[key_col]) if key_col in group else None + desc = series_meta.get(sid, {}) if sid is not None else {} + pcode = desc.get("parameter_code") or ( + _first(group["parameter_code"]) if "parameter_code" in group else None + ) + + name = _slug(desc.get("parameter_name") or pcode) + if name in used: # disambiguate same-parameter, different-statistic vars + comp = desc.get("computation_identifier") or _first( + group.get("statistic_id", _pd.Series(dtype=object)) + ) + name = f"{name}_{_slug(comp)}" if comp else name + while name in used: + name += "_x" + used.add(name) + + ancillary = [c for c in _ANCILLARY if c in group.columns] + sub = group.set_index([inst, "time"])[["value", *ancillary]] + if not sub.index.is_unique: + sub = sub[~sub.index.duplicated(keep="first")] + ds_g = sub.to_xarray().rename( + {"value": name, **{c: f"{name}_{c}" for c in ancillary}} + ) + ds_g[name].attrs = _var_attrs( + desc, + group, + default_cell_method=default_cell_method, + pcode=pcode, + ancillary=ancillary, + name=name, + ) + datasets.append(ds_g) + + # Outer join on time: parameters sampled on different clocks share a + # union time 
axis, NaN where a given parameter has no observation. + ds = _xr.merge(datasets, combine_attrs="drop_conflicts", join="outer") + ds.attrs = _dataset_attrs(service, base_meta) + ds["time"].attrs.setdefault("standard_name", "time") + if inst in ds.coords: + ds[inst].attrs.setdefault("cf_role", "timeseries_id") + + coords = _point_coords(df, inst) + if coords is not None: + lon, lat = coords + order = list(ds[inst].values) + ds = ds.assign_coords( + longitude=(inst, [lon.get(k) for k in order]), + latitude=(inst, [lat.get(k) for k in order]), + ) + ds["longitude"].attrs = {"standard_name": "longitude", "units": "degrees_east"} + ds["latitude"].attrs = {"standard_name": "latitude", "units": "degrees_north"} + + # Surface the human-readable site as a coordinate when the id is the dim. + if inst == key_col and "monitoring_location_id" in df.columns: + sites = df.drop_duplicates(key_col).set_index(key_col)["monitoring_location_id"] + ds = ds.assign_coords( + monitoring_location_id=(inst, sites.reindex(ds[inst].values).values) + ) + return ds + + +def _build_stats(df, base_meta, service): + """Best-effort, preliminary conversion of the statistics tables. + + The statistics service returns percentile tables keyed by time-of-year + rather than a (time, value) series, so this produces a flat Dataset (one + variable per column over an ``index`` dimension) with dataset-level + provenance. A richer percentile/day-of-year layout is future work. 
+ """ + if df is None or len(df) == 0: + return _empty_dataset(service, base_meta) + flat = df.drop(columns=[c for c in ("geometry",) if c in df.columns]) + ds = _xr.Dataset.from_dataframe(flat.reset_index(drop=True)) + ds.attrs = _dataset_attrs(service, base_meta) + ds.attrs["comment"] = "preliminary flat conversion; see module docs" + return ds + + +# --- public wrappers ------------------------------------------------------- + + +def _sites(df): + """Unique monitoring-location ids present in a values frame.""" + if "monitoring_location_id" in df: + return df["monitoring_location_id"].unique() + return [] + + +def _xr_doc(func): + """Prepend an xarray note to the wrapped getter's docstring.""" + note = ( + " xarray wrapper: same arguments as " + f"``dataretrieval.waterdata.{func.__name__}``, but returns a\n" + " CF-conventions ``xarray.Dataset`` with series metadata populated.\n\n" + ) + return note + (func.__doc__ or "") + + +def _timeseries_wrapper(func, *, service, default_cell_method=None): + @_wraps(func) + def wrapper(*args, **kwargs): + kwargs.setdefault("include_hash", True) + df, base_meta = func(*args, **kwargs) + return _build_timeseries( + df, + base_meta, + service=service, + series_meta=_timeseries_metadata(_sites(df)), + default_cell_method=default_cell_method, + ) + + wrapper.__doc__ = _xr_doc(func) + return wrapper + + +def _field_wrapper(func, *, service): + @_wraps(func) + def wrapper(*args, **kwargs): + kwargs.setdefault("include_hash", True) + df, base_meta = func(*args, **kwargs) + return _build_timeseries( + df, + base_meta, + service=service, + series_meta=_field_metadata(_sites(df)), + key_col="field_measurements_series_id", + group_cols=("parameter_code",), + default_cell_method="point", + ) + + wrapper.__doc__ = _xr_doc(func) + return wrapper + + +def _stats_wrapper(func, *, service): + @_wraps(func) + def wrapper(*args, **kwargs): + kwargs.setdefault("include_hash", True) + df, base_meta = func(*args, **kwargs) + return 
_build_stats(df, base_meta, service) + + wrapper.__doc__ = _xr_doc(func) + return wrapper + + +get_daily = _timeseries_wrapper(_api.get_daily, service="daily") +get_continuous = _timeseries_wrapper(_api.get_continuous, service="continuous") +get_latest_continuous = _timeseries_wrapper( + _api.get_latest_continuous, service="latest-continuous", default_cell_method="point" +) +get_latest_daily = _timeseries_wrapper( + _api.get_latest_daily, service="latest-daily", default_cell_method="point" +) +get_nearest_continuous = _timeseries_wrapper( + _get_nearest_continuous, service="continuous", default_cell_method="point" +) +get_peaks = _timeseries_wrapper( + _api.get_peaks, service="peaks", default_cell_method="maximum" +) +get_field_measurements = _field_wrapper( + _api.get_field_measurements, service="field-measurements" +) +get_stats_por = _stats_wrapper(_api.get_stats_por, service="statistics") +get_stats_date_range = _stats_wrapper(_api.get_stats_date_range, service="statistics") diff --git a/pyproject.toml b/pyproject.toml index 65b1ae68..2ffc823a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -54,6 +54,9 @@ doc = [ nldi = [ 'geopandas>=0.10' ] +xarray = [ + 'xarray>=2023.1.0' +] [project.urls] homepage = "https://github.com/DOI-USGS/dataretrieval-python" diff --git a/tests/waterdata_xarray_test.py b/tests/waterdata_xarray_test.py new file mode 100644 index 00000000..ffc961ea --- /dev/null +++ b/tests/waterdata_xarray_test.py @@ -0,0 +1,185 @@ +"""Offline unit tests for dataretrieval.waterdata.xarray converters. + +These exercise the pure DataFrame -> xarray.Dataset converters with synthetic +frames, so they run without network access. Live end-to-end behavior is +covered by the waterdata getters' own tests. 
+""" + +from types import SimpleNamespace + +import pandas as pd +import pytest + +xr = pytest.importorskip("xarray") +from dataretrieval.waterdata import xarray as wdx # noqa: E402 + + +def _meta(url="https://example.test/items"): + return SimpleNamespace(url=url) + + +def _daily_frame( + time_series_id="A", + site="USGS-1", + values=(100, 110), + times=("2024-06-01", "2024-06-02"), +): + n = len(values) + return pd.DataFrame( + { + "time": list(times), + "value": list(values), + "monitoring_location_id": [site] * n, + "parameter_code": ["00060"] * n, + "statistic_id": ["00003"] * n, + "unit_of_measure": ["ft^3/s"] * n, + "qualifier": [None] * n, + "approval_status": ["Approved"] * n, + "time_series_id": [time_series_id] * n, + } + ) + + +_DISCHARGE_META = { + "A": { + "parameter_code": "00060", + "parameter_name": "Discharge", + "parameter_description": "Discharge, cubic feet per second", + "unit_of_measure": "ft^3/s", + "statistic_id": "00003", + "computation_identifier": "Mean", + } +} + + +def test_build_timeseries_cf_attributes(): + ds = wdx._build_timeseries( + _daily_frame(), _meta(), service="daily", series_meta=_DISCHARGE_META + ) + assert isinstance(ds, xr.Dataset) + assert "discharge" in ds.data_vars + v = ds["discharge"] + assert v.attrs["long_name"] == "Discharge, cubic feet per second" + assert v.attrs["units"] == "ft3 s-1" # UDUNITS-normalized from "ft^3/s" + assert v.attrs["cell_methods"] == "time: mean" + assert v.attrs["standard_name"] == "water_volume_transport_in_river_channel" + assert v.attrs["usgs_parameter_code"] == "00060" + assert v.attrs["usgs_statistic_id"] == "00003" + # dataset-level provenance + assert ds.attrs["Conventions"] == "CF-1.11" + assert ds.attrs["featureType"] == "timeSeries" + assert ds.attrs["references"] == "https://example.test/items" + # site is the DSG instance dimension + assert ds["monitoring_location_id"].attrs.get("cf_role") == "timeseries_id" + assert ds.sizes["time"] == 2 + + +def 
test_ancillary_variables_linked(): + ds = wdx._build_timeseries( + _daily_frame(), _meta(), service="daily", series_meta=_DISCHARGE_META + ) + assert "discharge_qualifier" in ds.data_vars + assert "discharge_approval_status" in ds.data_vars + assert ds["discharge"].attrs["ancillary_variables"] == ( + "discharge_qualifier discharge_approval_status" + ) + + +def test_unknown_unit_passes_through_verbatim(): + meta = {"A": dict(_DISCHARGE_META["A"], unit_of_measure="widgets/s")} + df = _daily_frame() + df["unit_of_measure"] = "widgets/s" + ds = wdx._build_timeseries(df, _meta(), service="daily", series_meta=meta) + assert ds["discharge"].attrs["units"] == "widgets/s" + + +def test_missing_standard_name_is_omitted(): + # parameter_code with no curated CF mapping -> no standard_name key + meta = { + "A": dict( + _DISCHARGE_META["A"], parameter_code="99999", parameter_name="Mystery" + ) + } + df = _daily_frame() + df["parameter_code"] = "99999" + ds = wdx._build_timeseries(df, _meta(), service="daily", series_meta=meta) + assert "standard_name" not in ds["mystery"].attrs + assert ds["mystery"].attrs["usgs_parameter_code"] == "99999" + + +def test_multiple_parameters_outer_join_on_time(): + # discharge at t1,t2 ; temperature at t2,t3 -> union time, NaN fill + q = _daily_frame(values=(100, 110), times=("2024-06-01", "2024-06-02")) + t = pd.DataFrame( + { + "time": ["2024-06-02", "2024-06-03"], + "value": [18.0, 19.0], + "monitoring_location_id": ["USGS-1", "USGS-1"], + "parameter_code": ["00010", "00010"], + "statistic_id": ["00011", "00011"], + "unit_of_measure": ["degC", "degC"], + "qualifier": [None, None], + "approval_status": ["Approved", "Approved"], + "time_series_id": ["B", "B"], + } + ) + meta = dict(_DISCHARGE_META) + meta["B"] = { + "parameter_code": "00010", + "parameter_name": "Temperature, water", + "unit_of_measure": "degC", + "statistic_id": "00011", + "computation_identifier": "Instantaneous", + } + ds = wdx._build_timeseries( + pd.concat([q, t]), _meta(), 
service="continuous", series_meta=meta + ) + assert {"discharge", "temperature_water"} <= set(ds.data_vars) + assert ds.sizes["time"] == 3 # union of {t1,t2,t3} + # temperature has no value at t1 -> NaN + assert pd.isna(ds["temperature_water"].sel(time="2024-06-01").item()) + assert ds["temperature_water"].attrs["cell_methods"] == "time: point" + + +def test_collision_falls_back_to_series_id_dim(): + # two distinct series share (site, parameter, statistic) AND a timestamp + a = _daily_frame(time_series_id="A", values=(100,), times=("2024-06-01",)) + b = _daily_frame(time_series_id="B", values=(200,), times=("2024-06-01",)) + meta = {"A": _DISCHARGE_META["A"], "B": _DISCHARGE_META["A"]} + with pytest.warns(UserWarning, match="instance dimension"): + ds = wdx._build_timeseries( + pd.concat([a, b]), _meta(), service="daily", series_meta=meta + ) + assert "time_series_id" in ds.dims + # the human-readable site is preserved as a coordinate + assert "monitoring_location_id" in ds.coords + assert ds.sizes["time_series_id"] == 2 + + +def test_empty_frame_returns_dataset_with_conventions(): + ds = wdx._build_timeseries(pd.DataFrame(), _meta(), service="daily", series_meta={}) + assert isinstance(ds, xr.Dataset) + assert list(ds.data_vars) == [] + assert ds.attrs["Conventions"] == "CF-1.11" + + +def test_build_stats_flat_dataset(): + df = pd.DataFrame( + { + "monitoring_location_id": ["USGS-1", "USGS-1"], + "parameter_code": ["00060", "00060"], + "month": [1, 2], + "p50_va": [120.0, 130.0], + } + ) + ds = wdx._build_stats(df, _meta(), "statistics") + assert isinstance(ds, xr.Dataset) + assert "p50_va" in ds.data_vars + assert ds.attrs["Conventions"] == "CF-1.11" + + +def test_public_wrappers_exist_and_are_documented(): + for name in wdx.__all__: + fn = getattr(wdx, name) + assert callable(fn) + assert "xarray wrapper" in (fn.__doc__ or "") From 6bc2b269384466641131bc37644239c2922c1776 Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Sun, 24 May 2026 21:40:10 -0500 Subject: 
[PATCH 10/24] perf(waterdata.xarray): build datasets from the lean hash-free frame The wrappers previously forced include_hash=True purely to keep a join key, which materialized the per-record UUID column (a unique ~36-char string per row -- e.g. continuous_id measured at ~155 KB / 1825 unique values on a 20-day continuous pull) only to discard it. Drop that. The wrappers now fetch the underlying getter's default, hash-free output, so the heavy UUID column is never requested or built. Every CF attribute is derived from columns already present: units from unit_of_measure, cell_methods from statistic_id (new _STATISTIC_CELL_METHOD map), standard_name/usgs_parameter_code from parameter_code. Only the human-readable parameter name still needs the metadata endpoint, now looked up (and cached) keyed by parameter_code rather than a series hash. time and monitoring_location_id are the coordinates; the converter slims to just the columns it pivots before copying. Without the hash key the rare two-series-per-(site, parameter, statistic) collision can no longer be split (the values are inherently ambiguous without it), so it now keeps the first value and warns instead of switching the instance dimension. Tests updated accordingly. Co-Authored-By: Claude Opus 4.7 (1M context) --- dataretrieval/waterdata/xarray.py | 200 ++++++++++++++---------------- tests/waterdata_xarray_test.py | 53 ++++---- 2 files changed, 118 insertions(+), 135 deletions(-) diff --git a/dataretrieval/waterdata/xarray.py b/dataretrieval/waterdata/xarray.py index 8d81bf87..5b7ff6d9 100644 --- a/dataretrieval/waterdata/xarray.py +++ b/dataretrieval/waterdata/xarray.py @@ -17,9 +17,12 @@ * dataset-level attributes carry ``Conventions``, provenance, and the request URL. -The wrappers call the underlying getter with ``include_hash=True`` so the -join key survives, perform the metadata lookup, then drop the opaque hash -columns from the user-facing result. 
+The wrappers call the underlying getter with its default, hash-free output +-- so the large per-record UUID column is never fetched or materialized -- +and derive the CF attributes directly from the surviving columns +(``unit_of_measure`` -> ``units``, ``statistic_id`` -> ``cell_methods``, +``parameter_code`` -> ``standard_name``). Only the human-readable parameter +name comes from a small, cached metadata lookup keyed by ``parameter_code``. This module requires the optional ``xarray`` dependency:: @@ -79,15 +82,16 @@ "%": "percent", } -# computation_identifier -> the operator in a CF ``cell_methods`` string. -_CELL_METHOD = { - "Mean": "mean", - "Sum": "sum", - "Maximum": "maximum", - "Max At Event Time": "maximum", - "Minimum": "minimum", - "Median": "median", - "Instantaneous": "point", +# USGS statistic_id -> the operator in a CF ``cell_methods`` string. Read +# straight from the values frame, so no metadata round-trip is needed to +# classify the aggregation. +_STATISTIC_CELL_METHOD = { + "00001": "maximum", + "00002": "minimum", + "00003": "mean", + "00006": "sum", + "00008": "median", + "00011": "point", # instantaneous } # USGS 5-digit parameter code -> CF standard_name. Deliberately conservative; @@ -109,38 +113,31 @@ _TS_META_CACHE: dict[str, dict[str, dict]] = {} _FIELD_META_CACHE: dict[str, dict[str, dict]] = {} -_TS_DESCRIPTORS = ( - "parameter_code", - "parameter_name", - "parameter_description", - "unit_of_measure", - "statistic_id", - "computation_period_identifier", - "computation_identifier", -) -_FIELD_DESCRIPTORS = ("parameter_code", "parameter_name", "parameter_description") +# Only the human-readable name is sourced from the metadata endpoint; units, +# statistic, and parameter code all come from the values frame itself. +_NAME_DESCRIPTORS = ("parameter_name", "parameter_description") -def _lookup(site_ids, cache, getter, id_col, descriptors): - """Fetch and cache metadata descriptors keyed by the series id column. 
+def _lookup(site_ids, cache, getter): + """Fetch and cache ``{parameter_code: {name descriptors}}`` for sites. - Returns a flat ``{series_id: {descriptor: value}}`` dict covering every - requested site. One network call per not-yet-cached batch of sites; the - cache is keyed by site so repeated getter calls reuse it. + Keyed by ``parameter_code`` (stable and 1:1 with the parameter name), so + the lookup needs no hash id. One metadata call per not-yet-cached batch + of sites; the cache is keyed by site so repeated getter calls reuse it. """ sites = sorted({str(s) for s in site_ids if _pd.notna(s)}) todo = [s for s in sites if s not in cache] if todo: - meta, _ = getter(monitoring_location_id=todo, include_hash=True) + meta, _ = getter(monitoring_location_id=todo) for s in todo: cache[s] = {} - if not meta.empty: - cols = [c for c in descriptors if c in meta.columns] + if not meta.empty and "parameter_code" in meta.columns: + cols = [c for c in _NAME_DESCRIPTORS if c in meta.columns] for _, row in meta.iterrows(): site = row.get("monitoring_location_id") - sid = row.get(id_col) - if site in cache and _pd.notna(sid): - cache[site][sid] = {c: row.get(c) for c in cols} + pcode = row.get("parameter_code") + if site in cache and _pd.notna(pcode): + cache[site][str(pcode)] = {c: row.get(c) for c in cols} out: dict[str, dict] = {} for s in sites: out.update(cache.get(s, {})) @@ -148,23 +145,11 @@ def _lookup(site_ids, cache, getter, id_col, descriptors): def _timeseries_metadata(site_ids): - return _lookup( - site_ids, - _TS_META_CACHE, - _api.get_time_series_metadata, - "time_series_id", - _TS_DESCRIPTORS, - ) + return _lookup(site_ids, _TS_META_CACHE, _api.get_time_series_metadata) def _field_metadata(site_ids): - return _lookup( - site_ids, - _FIELD_META_CACHE, - _api.get_field_measurements_metadata, - "field_series_id", - _FIELD_DESCRIPTORS, - ) + return _lookup(site_ids, _FIELD_META_CACHE, _api.get_field_measurements_metadata) # --- helpers 
--------------------------------------------------------------- @@ -182,20 +167,26 @@ def _first(series): return nonnull.iloc[0] if len(nonnull) else None -def _var_attrs(desc, group, *, default_cell_method, pcode, ancillary, name): - """Build the CF attribute dict for one data variable.""" +def _var_attrs(desc, *, unit, pcode, stat, default_cell_method, ancillary, name): + """Build the CF attribute dict for one data variable. + + ``unit``, ``pcode`` and ``stat`` are read from the values frame; ``desc`` + supplies only the human-readable name from the metadata lookup. + """ attrs: dict[str, str] = {} long_name = desc.get("parameter_description") or desc.get("parameter_name") if long_name and _pd.notna(long_name): attrs["long_name"] = str(long_name) - unit = desc.get("unit_of_measure") - if (unit is None or _pd.isna(unit)) and "unit_of_measure" in group: - unit = _first(group["unit_of_measure"]) if unit is not None and _pd.notna(unit): attrs["units"] = _UDUNITS.get(str(unit), str(unit)) - op = _CELL_METHOD.get(desc.get("computation_identifier"), default_cell_method) + op = ( + _STATISTIC_CELL_METHOD.get(str(stat)) + if stat is not None and _pd.notna(stat) + else None + ) + op = op or default_cell_method if op: attrs["cell_methods"] = f"time: {op}" @@ -205,9 +196,6 @@ def _var_attrs(desc, group, *, default_cell_method, pcode, ancillary, name): attrs["standard_name"] = sn attrs["usgs_parameter_code"] = str(pcode) - stat = desc.get("statistic_id") - if (stat is None or _pd.isna(stat)) and "statistic_id" in group: - stat = _first(group["statistic_id"]) if stat is not None and _pd.notna(stat): attrs["usgs_statistic_id"] = str(stat) @@ -254,69 +242,77 @@ def _point_coords(df, inst): return lon, lat +_INSTANCE = "monitoring_location_id" + + def _build_timeseries( df, base_meta, *, service, series_meta, - key_col="time_series_id", group_cols=("parameter_code", "statistic_id"), default_cell_method=None, ): - """Long values frame -> CF timeSeries Dataset (one var per 
parameter).""" + """Hash-free values frame -> CF timeSeries Dataset (one var per parameter). + + The frame carries no hash columns (the wrappers fetch the default lean + output); every CF attribute is derived from ``parameter_code`` / + ``statistic_id`` / ``unit_of_measure`` already present, plus the + human-readable name from ``series_meta`` keyed by ``parameter_code``. + ``time`` and ``monitoring_location_id`` become the coordinates. + """ if df is None or len(df) == 0: return _empty_dataset(service, base_meta) - df = df.copy() - # Normalize to naive-UTC so xarray can store datetime64 (it has no tz dtype). - times = _pd.to_datetime(df["time"], errors="coerce", utc=True) - df["time"] = times.dt.tz_localize(None) - df["value"] = _pd.to_numeric(df["value"], errors="coerce") group_cols = [c for c in group_cols if c in df.columns] - - # Instance (DSG) dimension: the site, unless two series collide on - # (site, parameter, time) -- then fall back to the unambiguous series id. - inst = "monitoring_location_id" - if key_col in df.columns and df.duplicated(group_cols + [inst, "time"]).any(): - inst = key_col - _warnings.warn( - "multiple time series share a (site, parameter); using " - f"'{key_col}' as the instance dimension instead of " - "'monitoring_location_id'.", - stacklevel=3, - ) + ancillary = [c for c in _ANCILLARY if c in df.columns] + has_unit = "unit_of_measure" in df.columns + # Slim to just the columns we convert, so the heavy frame (and any columns + # we ignore) is not copied wholesale. + cols = [_INSTANCE, "time", "value", *group_cols, *ancillary] + if has_unit: + cols.append("unit_of_measure") + work = df.loc[:, list(dict.fromkeys(cols))].copy() + # Normalize to naive-UTC so xarray can store datetime64 (it has no tz dtype). 
+ work["time"] = _pd.to_datetime( + work["time"], errors="coerce", utc=True + ).dt.tz_localize(None) + work["value"] = _pd.to_numeric(work["value"], errors="coerce") datasets, used = [], set() - for _, group in df.groupby(group_cols, dropna=False): - sid = _first(group[key_col]) if key_col in group else None - desc = series_meta.get(sid, {}) if sid is not None else {} - pcode = desc.get("parameter_code") or ( - _first(group["parameter_code"]) if "parameter_code" in group else None - ) + for _, group in work.groupby(group_cols, dropna=False): + pcode = _first(group["parameter_code"]) if "parameter_code" in group else None + stat = _first(group["statistic_id"]) if "statistic_id" in group else None + unit = _first(group["unit_of_measure"]) if has_unit else None + desc = series_meta.get(str(pcode), {}) if pcode is not None else {} name = _slug(desc.get("parameter_name") or pcode) - if name in used: # disambiguate same-parameter, different-statistic vars - comp = desc.get("computation_identifier") or _first( - group.get("statistic_id", _pd.Series(dtype=object)) - ) - name = f"{name}_{_slug(comp)}" if comp else name + if name in used: # same parameter, different statistic -> distinct var + op = _STATISTIC_CELL_METHOD.get(str(stat)) or (str(stat) if stat else None) + name = f"{name}_{_slug(op)}" if op else name while name in used: name += "_x" used.add(name) - ancillary = [c for c in _ANCILLARY if c in group.columns] - sub = group.set_index([inst, "time"])[["value", *ancillary]] + sub = group.set_index([_INSTANCE, "time"])[["value", *ancillary]] if not sub.index.is_unique: + _warnings.warn( + f"'{name}' has multiple values per (site, time) -- two series " + "share this (site, parameter, statistic); keeping the first. 
" + "Filter the query to separate them.", + stacklevel=3, + ) sub = sub[~sub.index.duplicated(keep="first")] ds_g = sub.to_xarray().rename( {"value": name, **{c: f"{name}_{c}" for c in ancillary}} ) ds_g[name].attrs = _var_attrs( desc, - group, - default_cell_method=default_cell_method, + unit=unit, pcode=pcode, + stat=stat, + default_cell_method=default_cell_method, ancillary=ancillary, name=name, ) @@ -327,26 +323,20 @@ def _build_timeseries( ds = _xr.merge(datasets, combine_attrs="drop_conflicts", join="outer") ds.attrs = _dataset_attrs(service, base_meta) ds["time"].attrs.setdefault("standard_name", "time") - if inst in ds.coords: - ds[inst].attrs.setdefault("cf_role", "timeseries_id") + if _INSTANCE in ds.coords: + ds[_INSTANCE].attrs.setdefault("cf_role", "timeseries_id") - coords = _point_coords(df, inst) + coords = _point_coords(df, _INSTANCE) if coords is not None: lon, lat = coords - order = list(ds[inst].values) + order = list(ds[_INSTANCE].values) ds = ds.assign_coords( - longitude=(inst, [lon.get(k) for k in order]), - latitude=(inst, [lat.get(k) for k in order]), + longitude=(_INSTANCE, [lon.get(k) for k in order]), + latitude=(_INSTANCE, [lat.get(k) for k in order]), ) ds["longitude"].attrs = {"standard_name": "longitude", "units": "degrees_east"} ds["latitude"].attrs = {"standard_name": "latitude", "units": "degrees_north"} - # Surface the human-readable site as a coordinate when the id is the dim. 
- if inst == key_col and "monitoring_location_id" in df.columns: - sites = df.drop_duplicates(key_col).set_index(key_col)["monitoring_location_id"] - ds = ds.assign_coords( - monitoring_location_id=(inst, sites.reindex(ds[inst].values).values) - ) return ds @@ -390,7 +380,8 @@ def _xr_doc(func): def _timeseries_wrapper(func, *, service, default_cell_method=None): @_wraps(func) def wrapper(*args, **kwargs): - kwargs.setdefault("include_hash", True) + # Default (hash-free) fetch: the per-record UUID is never requested or + # materialized; CF attributes come from the surviving columns. df, base_meta = func(*args, **kwargs) return _build_timeseries( df, @@ -407,14 +398,12 @@ def wrapper(*args, **kwargs): def _field_wrapper(func, *, service): @_wraps(func) def wrapper(*args, **kwargs): - kwargs.setdefault("include_hash", True) df, base_meta = func(*args, **kwargs) return _build_timeseries( df, base_meta, service=service, series_meta=_field_metadata(_sites(df)), - key_col="field_measurements_series_id", group_cols=("parameter_code",), default_cell_method="point", ) @@ -426,7 +415,6 @@ def wrapper(*args, **kwargs): def _stats_wrapper(func, *, service): @_wraps(func) def wrapper(*args, **kwargs): - kwargs.setdefault("include_hash", True) df, base_meta = func(*args, **kwargs) return _build_stats(df, base_meta, service) diff --git a/tests/waterdata_xarray_test.py b/tests/waterdata_xarray_test.py index ffc961ea..0677da93 100644 --- a/tests/waterdata_xarray_test.py +++ b/tests/waterdata_xarray_test.py @@ -40,14 +40,12 @@ def _daily_frame( ) +# series_meta is keyed by parameter_code and supplies only the readable name; +# units/statistic/parameter_code come from the values frame itself. 
_DISCHARGE_META = { - "A": { - "parameter_code": "00060", + "00060": { "parameter_name": "Discharge", "parameter_description": "Discharge, cubic feet per second", - "unit_of_measure": "ft^3/s", - "statistic_id": "00003", - "computation_identifier": "Mean", } } @@ -86,20 +84,17 @@ def test_ancillary_variables_linked(): def test_unknown_unit_passes_through_verbatim(): - meta = {"A": dict(_DISCHARGE_META["A"], unit_of_measure="widgets/s")} df = _daily_frame() - df["unit_of_measure"] = "widgets/s" - ds = wdx._build_timeseries(df, _meta(), service="daily", series_meta=meta) + df["unit_of_measure"] = "widgets/s" # units are read from the frame + ds = wdx._build_timeseries( + df, _meta(), service="daily", series_meta=_DISCHARGE_META + ) assert ds["discharge"].attrs["units"] == "widgets/s" def test_missing_standard_name_is_omitted(): # parameter_code with no curated CF mapping -> no standard_name key - meta = { - "A": dict( - _DISCHARGE_META["A"], parameter_code="99999", parameter_name="Mystery" - ) - } + meta = {"99999": {"parameter_name": "Mystery", "parameter_description": "Mystery"}} df = _daily_frame() df["parameter_code"] = "99999" ds = wdx._build_timeseries(df, _meta(), service="daily", series_meta=meta) @@ -124,12 +119,9 @@ def test_multiple_parameters_outer_join_on_time(): } ) meta = dict(_DISCHARGE_META) - meta["B"] = { - "parameter_code": "00010", + meta["00010"] = { "parameter_name": "Temperature, water", - "unit_of_measure": "degC", - "statistic_id": "00011", - "computation_identifier": "Instantaneous", + "parameter_description": "Temperature, water, degrees Celsius", } ds = wdx._build_timeseries( pd.concat([q, t]), _meta(), service="continuous", series_meta=meta @@ -138,22 +130,25 @@ def test_multiple_parameters_outer_join_on_time(): assert ds.sizes["time"] == 3 # union of {t1,t2,t3} # temperature has no value at t1 -> NaN assert pd.isna(ds["temperature_water"].sel(time="2024-06-01").item()) + # cell_methods derived from statistic_id 00011 (instantaneous) -> 
point assert ds["temperature_water"].attrs["cell_methods"] == "time: point" -def test_collision_falls_back_to_series_id_dim(): - # two distinct series share (site, parameter, statistic) AND a timestamp - a = _daily_frame(time_series_id="A", values=(100,), times=("2024-06-01",)) - b = _daily_frame(time_series_id="B", values=(200,), times=("2024-06-01",)) - meta = {"A": _DISCHARGE_META["A"], "B": _DISCHARGE_META["A"]} - with pytest.warns(UserWarning, match="instance dimension"): +def test_collision_dedups_with_warning(): + # two values for the same (site, parameter, statistic, time) are ambiguous + # without the hash key -> keep the first and warn; site stays the dim. + a = _daily_frame(values=(100,), times=("2024-06-01",)) + b = _daily_frame(values=(200,), times=("2024-06-01",)) + with pytest.warns(UserWarning, match="multiple values per"): ds = wdx._build_timeseries( - pd.concat([a, b]), _meta(), service="daily", series_meta=meta + pd.concat([a, b]), _meta(), service="daily", series_meta=_DISCHARGE_META ) - assert "time_series_id" in ds.dims - # the human-readable site is preserved as a coordinate - assert "monitoring_location_id" in ds.coords - assert ds.sizes["time_series_id"] == 2 + assert "monitoring_location_id" in ds.dims + assert ds.sizes["time"] == 1 + assert ( + ds["discharge"].sel(monitoring_location_id="USGS-1", time="2024-06-01").item() + == 100 + ) def test_empty_frame_returns_dataset_with_conventions(): From 6df10c4aad07cf75866579ca3ada6e9c9fab2421 Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Sun, 24 May 2026 22:48:03 -0500 Subject: [PATCH 11/24] refactor(waterdata.xarray): move CF vocabulary maps to types The CF lookup tables (USGS units -> UDUNITS, statistic_id -> cell_methods operator, parameter_code -> standard_name) are plain reference data, so move them out of the converter module into types.py as public, extensible constants (CF_UNIT_MAP, CF_CELL_METHODS, CF_STANDARD_NAMES) alongside the existing PROFILE_LOOKUP. 
They carry no xarray dependency, so types.py stays import-light and the tables can be extended without importing the xarray-optional converter. xarray.py imports them at the top; behavior is unchanged. Co-Authored-By: Claude Opus 4.7 (1M context) --- dataretrieval/waterdata/types.py | 44 ++++++++++++++++++++++++ dataretrieval/waterdata/xarray.py | 56 +++++-------------------------- 2 files changed, 53 insertions(+), 47 deletions(-) diff --git a/dataretrieval/waterdata/types.py b/dataretrieval/waterdata/types.py index f5e1496b..eb4d0a27 100644 --- a/dataretrieval/waterdata/types.py +++ b/dataretrieval/waterdata/types.py @@ -74,3 +74,47 @@ "count", ], } + + +# --- CF / xarray vocabulary mappings --------------------------------------- +# Lookup tables used by :mod:`dataretrieval.waterdata.xarray` to translate +# USGS terms into CF-conventions metadata. Each is intentionally partial: +# anything not listed falls back to a sensible default (raw unit string kept +# verbatim; no standard_name emitted) rather than guessing a wrong CF term. +# They are plain data, so they live here rather than in the (xarray-optional) +# converter module and can be extended without importing xarray. + +# USGS unit strings -> UDUNITS / CF-canonical form. +CF_UNIT_MAP = { + "ft^3/s": "ft3 s-1", + "ft3/s": "ft3 s-1", + "ft": "ft", + "in": "in", + "degC": "degC", + "deg C": "degC", + "uS/cm": "uS/cm", + "mg/l": "mg L-1", + "mg/L": "mg L-1", + "tons/day": "short_ton day-1", + "%": "percent", +} + +# USGS statistic_id -> the operator in a CF ``cell_methods`` string. +CF_CELL_METHODS = { + "00001": "maximum", + "00002": "minimum", + "00003": "mean", + "00006": "sum", + "00008": "median", + "00011": "point", # instantaneous +} + +# USGS 5-digit parameter code -> CF standard_name. Deliberately conservative; +# codes without a confident match are left without a standard_name. 
+CF_STANDARD_NAMES = { + "00060": "water_volume_transport_in_river_channel", + "00010": "water_temperature", + "00065": "water_surface_height_above_reference_datum", + "63160": "water_surface_height_above_reference_datum", + "00045": "lwe_thickness_of_precipitation_amount", +} diff --git a/dataretrieval/waterdata/xarray.py b/dataretrieval/waterdata/xarray.py index 5b7ff6d9..6be06d5a 100644 --- a/dataretrieval/waterdata/xarray.py +++ b/dataretrieval/waterdata/xarray.py @@ -48,6 +48,7 @@ from . import api as _api from .nearest import get_nearest_continuous as _get_nearest_continuous +from .types import CF_CELL_METHODS, CF_STANDARD_NAMES, CF_UNIT_MAP __all__ = [ "get_continuous", @@ -62,47 +63,10 @@ ] -# --- CF mapping tables ----------------------------------------------------- -# Each is intentionally partial: anything not listed falls back to a sensible -# default (raw unit string kept verbatim; no standard_name emitted) rather -# than guessing and emitting a wrong CF term. - -# USGS unit strings -> UDUNITS / CF-canonical form. -_UDUNITS = { - "ft^3/s": "ft3 s-1", - "ft3/s": "ft3 s-1", - "ft": "ft", - "in": "in", - "degC": "degC", - "deg C": "degC", - "uS/cm": "uS/cm", - "mg/l": "mg L-1", - "mg/L": "mg L-1", - "tons/day": "short_ton day-1", - "%": "percent", -} - -# USGS statistic_id -> the operator in a CF ``cell_methods`` string. Read -# straight from the values frame, so no metadata round-trip is needed to -# classify the aggregation. -_STATISTIC_CELL_METHOD = { - "00001": "maximum", - "00002": "minimum", - "00003": "mean", - "00006": "sum", - "00008": "median", - "00011": "point", # instantaneous -} - -# USGS 5-digit parameter code -> CF standard_name. Deliberately conservative; -# codes without a confident match are left without a standard_name. 
-_STANDARD_NAME = { - "00060": "water_volume_transport_in_river_channel", - "00010": "water_temperature", - "00065": "water_surface_height_above_reference_datum", - "63160": "water_surface_height_above_reference_datum", - "00045": "lwe_thickness_of_precipitation_amount", -} +# The CF vocabulary lookups (USGS units -> UDUNITS, statistic_id -> +# cell_methods operator, parameter_code -> standard_name) are plain data and +# live in ``types`` -- imported as CF_UNIT_MAP / CF_CELL_METHODS / +# CF_STANDARD_NAMES at the top of this module. # Columns kept off the value pivot but surfaced as ancillary (flag) variables. _ANCILLARY = ("qualifier", "approval_status") @@ -179,19 +143,17 @@ def _var_attrs(desc, *, unit, pcode, stat, default_cell_method, ancillary, name) attrs["long_name"] = str(long_name) if unit is not None and _pd.notna(unit): - attrs["units"] = _UDUNITS.get(str(unit), str(unit)) + attrs["units"] = CF_UNIT_MAP.get(str(unit), str(unit)) op = ( - _STATISTIC_CELL_METHOD.get(str(stat)) - if stat is not None and _pd.notna(stat) - else None + CF_CELL_METHODS.get(str(stat)) if stat is not None and _pd.notna(stat) else None ) op = op or default_cell_method if op: attrs["cell_methods"] = f"time: {op}" if pcode is not None and _pd.notna(pcode): - sn = _STANDARD_NAME.get(str(pcode)) + sn = CF_STANDARD_NAMES.get(str(pcode)) if sn: attrs["standard_name"] = sn attrs["usgs_parameter_code"] = str(pcode) @@ -289,7 +251,7 @@ def _build_timeseries( name = _slug(desc.get("parameter_name") or pcode) if name in used: # same parameter, different statistic -> distinct var - op = _STATISTIC_CELL_METHOD.get(str(stat)) or (str(stat) if stat else None) + op = CF_CELL_METHODS.get(str(stat)) or (str(stat) if stat else None) name = f"{name}_{_slug(op)}" if op else name while name in used: name += "_x" From 140a3c43dba4186149789ae01d6ee10e10b079bb Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Mon, 25 May 2026 07:48:05 -0500 Subject: [PATCH 12/24] feat(waterdata.xarray): always drop hash 
IDs; ignore include_hash The xarray path never surfaces hash columns (neither the per-record OGC feature id nor the per-series join key), so route every wrapper through a _fetch helper that pops include_hash before calling the underlying getter. This makes the flag inert in the xarray path -- passing include_hash=True no longer triggers a fetch of a column we'd only discard -- and documents that hash IDs are always omitted here. The pandas getters keep include_hash as the opt-in flag; only the xarray layer drops them unconditionally. Co-Authored-By: Claude Opus 4.7 (1M context) --- dataretrieval/waterdata/xarray.py | 23 +++++++++++++++++------ tests/waterdata_xarray_test.py | 17 +++++++++++++++++ 2 files changed, 34 insertions(+), 6 deletions(-) diff --git a/dataretrieval/waterdata/xarray.py b/dataretrieval/waterdata/xarray.py index 6be06d5a..d79c3a83 100644 --- a/dataretrieval/waterdata/xarray.py +++ b/dataretrieval/waterdata/xarray.py @@ -329,12 +329,25 @@ def _sites(df): return [] +def _fetch(func, args, kwargs): + """Call the underlying getter with hash IDs forced off. + + The xarray path never surfaces hash columns (neither the per-record + UUID nor the per-series join key), so ``include_hash`` is dropped here: + passing it has no effect and we avoid fetching a column we'd discard. 
+ """ + kwargs.pop("include_hash", None) + return func(*args, **kwargs) + + def _xr_doc(func): """Prepend an xarray note to the wrapped getter's docstring.""" note = ( " xarray wrapper: same arguments as " f"``dataretrieval.waterdata.{func.__name__}``, but returns a\n" - " CF-conventions ``xarray.Dataset`` with series metadata populated.\n\n" + " CF-conventions ``xarray.Dataset`` with series metadata populated.\n" + " Hash-valued ID columns are always omitted here; the\n" + " ``include_hash`` flag does not apply.\n\n" ) return note + (func.__doc__ or "") @@ -342,9 +355,7 @@ def _xr_doc(func): def _timeseries_wrapper(func, *, service, default_cell_method=None): @_wraps(func) def wrapper(*args, **kwargs): - # Default (hash-free) fetch: the per-record UUID is never requested or - # materialized; CF attributes come from the surviving columns. - df, base_meta = func(*args, **kwargs) + df, base_meta = _fetch(func, args, kwargs) return _build_timeseries( df, base_meta, @@ -360,7 +371,7 @@ def wrapper(*args, **kwargs): def _field_wrapper(func, *, service): @_wraps(func) def wrapper(*args, **kwargs): - df, base_meta = func(*args, **kwargs) + df, base_meta = _fetch(func, args, kwargs) return _build_timeseries( df, base_meta, @@ -377,7 +388,7 @@ def wrapper(*args, **kwargs): def _stats_wrapper(func, *, service): @_wraps(func) def wrapper(*args, **kwargs): - df, base_meta = func(*args, **kwargs) + df, base_meta = _fetch(func, args, kwargs) return _build_stats(df, base_meta, service) wrapper.__doc__ = _xr_doc(func) diff --git a/tests/waterdata_xarray_test.py b/tests/waterdata_xarray_test.py index 0677da93..b258e279 100644 --- a/tests/waterdata_xarray_test.py +++ b/tests/waterdata_xarray_test.py @@ -178,3 +178,20 @@ def test_public_wrappers_exist_and_are_documented(): fn = getattr(wdx, name) assert callable(fn) assert "xarray wrapper" in (fn.__doc__ or "") + + +def test_fetch_strips_include_hash(): + # The xarray path never surfaces hash columns, so include_hash must be + # 
dropped before the underlying getter is called (no wasted fetch). + captured = {} + + def fake_getter(*args, **kwargs): + captured.update(kwargs) + return "df", "meta" + + df, meta = wdx._fetch( + fake_getter, (), {"include_hash": True, "parameter_code": "00060"} + ) + assert (df, meta) == ("df", "meta") + assert "include_hash" not in captured + assert captured == {"parameter_code": "00060"} From ad42b52d40ecca71ec535c800e92c85927b5549f Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Mon, 25 May 2026 08:00:47 -0500 Subject: [PATCH 13/24] fix(waterdata.xarray): accurate stats docstring; pin wrapper->_fetch routing Addresses two review gaps: - The shared wrapper docstring promised "a CF-conventions Dataset with series metadata populated", which is false for the stats wrappers -- _build_stats emits a flat, preliminary Dataset with only dataset-level provenance. Parametrize _xr_doc(cf_metadata=...) so get_stats_por/get_stats_date_range describe their actual output. - The include_hash strip was only tested on _fetch in isolation; nothing pinned the wrappers to route through it. Add a test that monkeypatches _fetch and asserts every public wrapper delegates to it (and forwards include_hash for _fetch to drop), guarding against a wrapper reverting to a direct getter call. Co-Authored-By: Claude Opus 4.7 (1M context) --- dataretrieval/waterdata/xarray.py | 25 ++++++++++++++++++------- tests/waterdata_xarray_test.py | 21 +++++++++++++++++++++ 2 files changed, 39 insertions(+), 7 deletions(-) diff --git a/dataretrieval/waterdata/xarray.py b/dataretrieval/waterdata/xarray.py index d79c3a83..de44cec2 100644 --- a/dataretrieval/waterdata/xarray.py +++ b/dataretrieval/waterdata/xarray.py @@ -340,14 +340,25 @@ def _fetch(func, args, kwargs): return func(*args, **kwargs) -def _xr_doc(func): - """Prepend an xarray note to the wrapped getter's docstring.""" +def _xr_doc(func, *, cf_metadata=True): + """Prepend an xarray note to the wrapped getter's docstring. 
+ + ``cf_metadata=False`` describes the preliminary stats path, which emits a + flat Dataset without per-variable CF attributes. + """ + returns = ( + "a CF-conventions ``xarray.Dataset`` with series metadata populated" + if cf_metadata + else ( + "a preliminary, flat ``xarray.Dataset`` (dataset-level provenance " + "only; per-variable CF metadata is not yet populated)" + ) + ) note = ( " xarray wrapper: same arguments as " - f"``dataretrieval.waterdata.{func.__name__}``, but returns a\n" - " CF-conventions ``xarray.Dataset`` with series metadata populated.\n" - " Hash-valued ID columns are always omitted here; the\n" - " ``include_hash`` flag does not apply.\n\n" + f"``dataretrieval.waterdata.{func.__name__}``, but returns\n" + f" {returns}. Hash-valued ID columns are always omitted here;\n" + " the ``include_hash`` flag does not apply.\n\n" ) return note + (func.__doc__ or "") @@ -391,7 +402,7 @@ def wrapper(*args, **kwargs): df, base_meta = _fetch(func, args, kwargs) return _build_stats(df, base_meta, service) - wrapper.__doc__ = _xr_doc(func) + wrapper.__doc__ = _xr_doc(func, cf_metadata=False) return wrapper diff --git a/tests/waterdata_xarray_test.py b/tests/waterdata_xarray_test.py index b258e279..c804412e 100644 --- a/tests/waterdata_xarray_test.py +++ b/tests/waterdata_xarray_test.py @@ -195,3 +195,24 @@ def fake_getter(*args, **kwargs): assert (df, meta) == ("df", "meta") assert "include_hash" not in captured assert captured == {"parameter_code": "00060"} + + +def test_every_wrapper_routes_through_fetch(monkeypatch): + # Pin the wiring: each public wrapper must delegate the underlying fetch to + # _fetch (which is what strips include_hash). Guards against a wrapper + # quietly reverting to calling the getter directly and leaking the flag. 
+ seen = [] + + def spy(func, args, kwargs): + seen.append(dict(kwargs)) + return pd.DataFrame(), SimpleNamespace(url=None) # empty -> no network + + monkeypatch.setattr(wdx, "_fetch", spy) + for name in wdx.__all__: + seen.clear() + ds = getattr(wdx, name)(monitoring_location_id="USGS-1", include_hash=True) + assert isinstance(ds, xr.Dataset) + assert len(seen) == 1, f"{name} did not route through _fetch" + # the wrapper hands include_hash to _fetch; _fetch is what drops it + # (asserted in test_fetch_strips_include_hash) + assert seen[0].get("include_hash") is True From 700120e137fb5187aa7944a29429996d1fe83b53 Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Mon, 25 May 2026 08:04:57 -0500 Subject: [PATCH 14/24] fix(waterdata.xarray): handle NaT times and mixed units in the converter Two correctness fixes in _build_timeseries from review: - NaT time: a row whose time fails to parse (errors="coerce" -> NaT) could not index the array and was silently swallowed by to_xarray (and an all-NaT group crashed). Drop such rows explicitly with a warning, and return an empty Dataset if nothing parseable remains -- no more silent data loss. - Mixed units: unit_of_measure was collapsed with _first over a (parameter_code, statistic_id) group that can span multiple units (e.g. the same parameter reported in different units across sites). A variable can carry only one units attr, so warn when a group spans multiple units instead of silently labeling every value with the first. Adds tests for the bad-time drop, all-bad-time empty result, and the mixed-unit warning. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- dataretrieval/waterdata/xarray.py | 27 ++++++++++++++++++++- tests/waterdata_xarray_test.py | 39 +++++++++++++++++++++++++++++++ 2 files changed, 65 insertions(+), 1 deletion(-) diff --git a/dataretrieval/waterdata/xarray.py b/dataretrieval/waterdata/xarray.py index de44cec2..87964392 100644 --- a/dataretrieval/waterdata/xarray.py +++ b/dataretrieval/waterdata/xarray.py @@ -242,11 +242,26 @@ def _build_timeseries( ).dt.tz_localize(None) work["value"] = _pd.to_numeric(work["value"], errors="coerce") + # A NaT time can't index the array and would be silently swallowed by + # ``to_xarray`` (or crash an all-NaT group). Drop such rows explicitly and + # say so, rather than losing observations without a trace. + n_before = len(work) + work = work[work["time"].notna()] + dropped = n_before - len(work) + if dropped: + _warnings.warn( + f"dropped {dropped} row(s) with an unparseable or missing time.", + stacklevel=3, + ) + if work.empty: + return _empty_dataset(service, base_meta) + datasets, used = [], set() for _, group in work.groupby(group_cols, dropna=False): pcode = _first(group["parameter_code"]) if "parameter_code" in group else None stat = _first(group["statistic_id"]) if "statistic_id" in group else None - unit = _first(group["unit_of_measure"]) if has_unit else None + group_units = group["unit_of_measure"].dropna().unique() if has_unit else () + unit = group_units[0] if len(group_units) else None desc = series_meta.get(str(pcode), {}) if pcode is not None else {} name = _slug(desc.get("parameter_name") or pcode) @@ -257,6 +272,16 @@ def _build_timeseries( name += "_x" used.add(name) + if len(group_units) > 1: + # One variable can carry only one ``units`` attr; surface the + # mix instead of silently labeling every value with the first. + _warnings.warn( + f"'{name}' spans multiple units {list(group_units)}; labeling " + f"with '{unit}'. 
Filter by site/parameter to avoid mixing " + "units in one variable.", + stacklevel=3, + ) + sub = group.set_index([_INSTANCE, "time"])[["value", *ancillary]] if not sub.index.is_unique: _warnings.warn( diff --git a/tests/waterdata_xarray_test.py b/tests/waterdata_xarray_test.py index c804412e..33fd2675 100644 --- a/tests/waterdata_xarray_test.py +++ b/tests/waterdata_xarray_test.py @@ -216,3 +216,42 @@ def spy(func, args, kwargs): # the wrapper hands include_hash to _fetch; _fetch is what drops it # (asserted in test_fetch_strips_include_hash) assert seen[0].get("include_hash") is True + + +def test_unparseable_time_dropped_with_warning(): + # A bad/missing time must be dropped explicitly (with a warning), not + # silently swallowed by to_xarray. + df = _daily_frame( + values=(100, 110, 120), + times=("2024-06-01", "not-a-date", "2024-06-03"), + ) + with pytest.warns(UserWarning, match="unparseable or missing time"): + ds = wdx._build_timeseries( + df, _meta(), service="daily", series_meta=_DISCHARGE_META + ) + assert ds.sizes["time"] == 2 # the bad-time row is gone, the good ones stay + assert 110 not in ds["discharge"].values # the dropped value did not survive + + +def test_all_unparseable_time_returns_empty_dataset(): + df = _daily_frame(values=(1, 2), times=("bad-a", "bad-b")) + with pytest.warns(UserWarning, match="unparseable or missing time"): + ds = wdx._build_timeseries( + df, _meta(), service="daily", series_meta=_DISCHARGE_META + ) + assert list(ds.data_vars) == [] + assert ds.attrs["Conventions"] == "CF-1.11" + + +def test_mixed_units_in_one_variable_warns(): + # Same (parameter, statistic) across two sites but different units -> one + # variable can hold only one units attr; warn instead of silently mislabeling. 
+ a = _daily_frame(site="USGS-1", values=(100,), times=("2024-06-01",)) + b = _daily_frame(site="USGS-2", values=(3,), times=("2024-06-01",)) + b["unit_of_measure"] = "m3 s-1" + with pytest.warns(UserWarning, match="spans multiple units"): + ds = wdx._build_timeseries( + pd.concat([a, b]), _meta(), service="daily", series_meta=_DISCHARGE_META + ) + assert ds.sizes["monitoring_location_id"] == 2 + assert ds["discharge"].attrs["units"] == "ft3 s-1" # first unit (ft^3/s) From 0c921f230d6c8f39fd11000892c5d9ffd71f80d9 Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Wed, 27 May 2026 07:50:09 -0500 Subject: [PATCH 15/24] feat(waterdata.xarray): add lon/lat, site descriptors, and date_modified Enrich the CF datasets with identity/metadata already present in calls the converter makes, so no new upstream requests are introduced: * lon/lat: read point geometry from either a shapely Point or the flattened [lon, lat] list returned when geopandas is absent. Previously the .x/.y access raised AttributeError and the spatial coordinates were silently dropped in the common (no-geopandas) install. * date_modified: surface the most recent last_modified across the frame as the dataset-level ACDD date_modified, so a reader knows how current the pull is. * hydrologic_unit_code / state_name: add as instance-indexed auxiliary coordinates, sourced from the time-series-metadata call already made. Station name/altitude/agency live only on the monitoring-locations endpoint (a separate call) and are intentionally not fetched. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- dataretrieval/waterdata/xarray.py | 144 ++++++++++++++++++++++++------ tests/waterdata_xarray_test.py | 85 ++++++++++++++++++ 2 files changed, 202 insertions(+), 27 deletions(-) diff --git a/dataretrieval/waterdata/xarray.py b/dataretrieval/waterdata/xarray.py index 87964392..86af2e66 100644 --- a/dataretrieval/waterdata/xarray.py +++ b/dataretrieval/waterdata/xarray.py @@ -13,9 +13,11 @@ USGS-parameter-code -> CF mapping exists; * the monitoring location is the CF "discrete sampling geometry" instance dimension (``cf_role = "timeseries_id"``), with ``longitude`` / ``latitude`` - coordinates when point geometry is available; -* dataset-level attributes carry ``Conventions``, provenance, and the - request URL. + coordinates from the response geometry (whether or not geopandas is + installed), plus ``hydrologic_unit_code`` / ``state_name`` when the + metadata call already carries them; +* dataset-level attributes carry ``Conventions``, provenance, the request + URL, and ``date_modified`` (the most recent upstream record refresh). The wrappers call the underlying getter with its default, hash-free output -- so the large per-record UUID column is never fetched or materialized -- @@ -74,38 +76,70 @@ # --- metadata lookups (cached per process) --------------------------------- -_TS_META_CACHE: dict[str, dict[str, dict]] = {} -_FIELD_META_CACHE: dict[str, dict[str, dict]] = {} +_TS_META_CACHE: dict[str, dict] = {} +_FIELD_META_CACHE: dict[str, dict] = {} # Only the human-readable name is sourced from the metadata endpoint; units, # statistic, and parameter code all come from the values frame itself. _NAME_DESCRIPTORS = ("parameter_name", "parameter_description") +# Per-site descriptors that ride along on the same metadata call (no extra +# request) and are surfaced as instance-indexed auxiliary coordinates. 
Only +# fields the endpoint already returns are used -- station name/altitude live on +# the monitoring-locations endpoint and would need a separate call, so they are +# intentionally not fetched here. +_SITE_DESCRIPTORS = ("hydrologic_unit_code", "state_name") + +# CF attributes for the instance-indexed site coordinates built from the +# descriptors above. +_SITE_COORD_ATTRS = { + "hydrologic_unit_code": {"long_name": "hydrologic unit code (HUC)"}, + "state_name": {"long_name": "state name"}, +} -def _lookup(site_ids, cache, getter): - """Fetch and cache ``{parameter_code: {name descriptors}}`` for sites. - Keyed by ``parameter_code`` (stable and 1:1 with the parameter name), so - the lookup needs no hash id. One metadata call per not-yet-cached batch - of sites; the cache is keyed by site so repeated getter calls reuse it. +def _lookup(site_ids, cache, getter): + """Fetch and cache the series + site descriptors for ``site_ids``. + + Returns ``(param_meta, site_meta)``: ``param_meta`` is + ``{parameter_code: {name descriptors}}`` (keyed by the stable parameter + code, so no hash id is needed) and ``site_meta`` is + ``{monitoring_location_id: {site descriptors present in the response}}``. + One metadata call per not-yet-cached batch of sites; the cache is keyed by + site so repeated getter calls reuse it. 
""" sites = sorted({str(s) for s in site_ids if _pd.notna(s)}) todo = [s for s in sites if s not in cache] if todo: meta, _ = getter(monitoring_location_id=todo) for s in todo: - cache[s] = {} - if not meta.empty and "parameter_code" in meta.columns: - cols = [c for c in _NAME_DESCRIPTORS if c in meta.columns] + cache[s] = {"params": {}, "site": {}} + if not meta.empty: + name_cols = [c for c in _NAME_DESCRIPTORS if c in meta.columns] + site_cols = [c for c in _SITE_DESCRIPTORS if c in meta.columns] + has_pcode = "parameter_code" in meta.columns for _, row in meta.iterrows(): site = row.get("monitoring_location_id") - pcode = row.get("parameter_code") - if site in cache and _pd.notna(pcode): - cache[site][str(pcode)] = {c: row.get(c) for c in cols} - out: dict[str, dict] = {} + if site not in cache: + continue + if has_pcode: + pcode = row.get("parameter_code") + if _pd.notna(pcode): + cache[site]["params"][str(pcode)] = { + c: row.get(c) for c in name_cols + } + if not cache[site]["site"]: + desc = {c: row.get(c) for c in site_cols if _pd.notna(row.get(c))} + if desc: + cache[site]["site"] = desc + param_meta: dict[str, dict] = {} + site_meta: dict[str, dict] = {} for s in sites: - out.update(cache.get(s, {})) - return out + entry = cache.get(s, {}) + param_meta.update(entry.get("params", {})) + if entry.get("site"): + site_meta[s] = entry["site"] + return param_meta, site_meta def _timeseries_metadata(site_ids): @@ -131,6 +165,19 @@ def _first(series): return nonnull.iloc[0] if len(nonnull) else None +def _date_modified(df): + """Most recent upstream record-refresh time in the frame, ISO-8601 or None. + + ``last_modified`` reflects when each record was last refreshed in the USGS + database; the maximum is surfaced as the dataset's ``date_modified`` (ACDD) + so a reader knows how current the pull is. 
+ """ + if df is None or "last_modified" not in getattr(df, "columns", ()): + return None + ts = _pd.to_datetime(df["last_modified"], errors="coerce", utc=True).dropna() + return ts.max().isoformat() if len(ts) else None + + def _var_attrs(desc, *, unit, pcode, stat, default_cell_method, ancillary, name): """Build the CF attribute dict for one data variable. @@ -189,18 +236,40 @@ def _empty_dataset(service, base_meta): return ds +def _lonlat(geom): + """``(lon, lat)`` from a geometry, or None if it isn't a point. + + Geometry comes back as a shapely ``Point`` when geopandas is installed but + as a plain ``[lon, lat]`` list when it is not (the OGC GeoJSON coordinates + are flattened into a list). Handle both so the spatial coordinates survive + either way; anything else (a polygon, a malformed cell) is skipped rather + than guessed. + """ + x, y = getattr(geom, "x", None), getattr(geom, "y", None) + if x is not None and y is not None: + return x, y + if isinstance(geom, (list, tuple)) and len(geom) >= 2: + try: + return float(geom[0]), float(geom[1]) + except (TypeError, ValueError): + return None + return None + + def _point_coords(df, inst): - """lon/lat per instance from a GeoDataFrame point geometry, or None.""" + """lon/lat per instance from point geometry, or None.""" if "geometry" not in df.columns: return None geo = df.dropna(subset=["geometry"]).drop_duplicates(inst) if geo.empty: return None - try: - lon = {row[inst]: row["geometry"].x for _, row in geo.iterrows()} - lat = {row[inst]: row["geometry"].y for _, row in geo.iterrows()} - except (AttributeError, TypeError): - return None # non-point geometry; skip rather than guess + lon, lat = {}, {} + for _, row in geo.iterrows(): + xy = _lonlat(row["geometry"]) + if xy is not None: + lon[row[inst]], lat[row[inst]] = xy + if not lon: + return None # no point geometry; skip rather than guess return lon, lat @@ -213,6 +282,7 @@ def _build_timeseries( *, service, series_meta, + site_meta=None, 
group_cols=("parameter_code", "statistic_id"), default_cell_method=None, ): @@ -309,6 +379,9 @@ def _build_timeseries( # union time axis, NaN where a given parameter has no observation. ds = _xr.merge(datasets, combine_attrs="drop_conflicts", join="outer") ds.attrs = _dataset_attrs(service, base_meta) + dm = _date_modified(df) + if dm: + ds.attrs["date_modified"] = dm ds["time"].attrs.setdefault("standard_name", "time") if _INSTANCE in ds.coords: ds[_INSTANCE].attrs.setdefault("cf_role", "timeseries_id") @@ -324,6 +397,16 @@ def _build_timeseries( ds["longitude"].attrs = {"standard_name": "longitude", "units": "degrees_east"} ds["latitude"].attrs = {"standard_name": "latitude", "units": "degrees_north"} + # Instance-indexed site descriptors carried along on the metadata call + # (HUC, state); added only where present so absent fields don't appear. + if site_meta and _INSTANCE in ds.coords: + order = list(ds[_INSTANCE].values) + for col, col_attrs in _SITE_COORD_ATTRS.items(): + vals = [site_meta.get(str(k), {}).get(col) for k in order] + if any(v is not None for v in vals): + ds = ds.assign_coords({col: (_INSTANCE, vals)}) + ds[col].attrs.update(col_attrs) + return ds @@ -341,6 +424,9 @@ def _build_stats(df, base_meta, service): ds = _xr.Dataset.from_dataframe(flat.reset_index(drop=True)) ds.attrs = _dataset_attrs(service, base_meta) ds.attrs["comment"] = "preliminary flat conversion; see module docs" + dm = _date_modified(df) + if dm: + ds.attrs["date_modified"] = dm return ds @@ -392,11 +478,13 @@ def _timeseries_wrapper(func, *, service, default_cell_method=None): @_wraps(func) def wrapper(*args, **kwargs): df, base_meta = _fetch(func, args, kwargs) + series_meta, site_meta = _timeseries_metadata(_sites(df)) return _build_timeseries( df, base_meta, service=service, - series_meta=_timeseries_metadata(_sites(df)), + series_meta=series_meta, + site_meta=site_meta, default_cell_method=default_cell_method, ) @@ -408,11 +496,13 @@ def _field_wrapper(func, *, service): 
@_wraps(func) def wrapper(*args, **kwargs): df, base_meta = _fetch(func, args, kwargs) + series_meta, site_meta = _field_metadata(_sites(df)) return _build_timeseries( df, base_meta, service=service, - series_meta=_field_metadata(_sites(df)), + series_meta=series_meta, + site_meta=site_meta, group_cols=("parameter_code",), default_cell_method="point", ) diff --git a/tests/waterdata_xarray_test.py b/tests/waterdata_xarray_test.py index 33fd2675..2de8f820 100644 --- a/tests/waterdata_xarray_test.py +++ b/tests/waterdata_xarray_test.py @@ -255,3 +255,88 @@ def test_mixed_units_in_one_variable_warns(): ) assert ds.sizes["monitoring_location_id"] == 2 assert ds["discharge"].attrs["units"] == "ft3 s-1" # first unit (ft^3/s) + + +def test_point_coords_from_list_geometry(): + # Without geopandas the OGC geometry is a plain [lon, lat] list, not a + # shapely Point. lon/lat must still be extracted (regression: the old + # ``.x``/``.y`` access raised AttributeError and silently dropped them). + df = _daily_frame() + df["geometry"] = [[-90.44, 43.19]] * len(df) + ds = wdx._build_timeseries( + df, _meta(), service="daily", series_meta=_DISCHARGE_META + ) + assert "longitude" in ds.coords and "latitude" in ds.coords + assert ds["longitude"].sel(monitoring_location_id="USGS-1").item() == -90.44 + assert ds["latitude"].sel(monitoring_location_id="USGS-1").item() == 43.19 + assert ds["longitude"].attrs["units"] == "degrees_east" + assert ds["latitude"].attrs["standard_name"] == "latitude" + + +def test_point_coords_from_pointlike_geometry(): + # The shapely-Point path (geopandas installed) still works: any object + # exposing .x/.y is read directly. 
+ df = _daily_frame() + df["geometry"] = [SimpleNamespace(x=-90.44, y=43.19)] * len(df) + ds = wdx._build_timeseries( + df, _meta(), service="daily", series_meta=_DISCHARGE_META + ) + assert ds["longitude"].sel(monitoring_location_id="USGS-1").item() == -90.44 + assert ds["latitude"].sel(monitoring_location_id="USGS-1").item() == 43.19 + + +def test_non_point_geometry_skipped(): + # A non-point geometry (no .x/.y, not a [lon, lat] pair) is skipped, not + # guessed -- no lon/lat coords are added. + df = _daily_frame() + df["geometry"] = [object()] * len(df) + ds = wdx._build_timeseries( + df, _meta(), service="daily", series_meta=_DISCHARGE_META + ) + assert "longitude" not in ds.coords and "latitude" not in ds.coords + + +def test_date_modified_from_last_modified(): + # The newest last_modified becomes the dataset-level date_modified (ACDD). + df = _daily_frame() + df["last_modified"] = ["2024-06-01T00:00:00Z", "2024-06-10T12:00:00Z"] + ds = wdx._build_timeseries( + df, _meta(), service="daily", series_meta=_DISCHARGE_META + ) + assert ds.attrs["date_modified"].startswith("2024-06-10") + + +def test_no_date_modified_without_last_modified(): + # The default frame has no last_modified column -> no date_modified attr. + ds = wdx._build_timeseries( + _daily_frame(), _meta(), service="daily", series_meta=_DISCHARGE_META + ) + assert "date_modified" not in ds.attrs + + +def test_site_metadata_coordinates(): + # HUC / state_name ride along on the metadata call and become + # instance-indexed auxiliary coordinates. 
+ site_meta = { + "USGS-1": {"hydrologic_unit_code": "07070005", "state_name": "Wisconsin"} + } + ds = wdx._build_timeseries( + _daily_frame(), + _meta(), + service="daily", + series_meta=_DISCHARGE_META, + site_meta=site_meta, + ) + huc = ds["hydrologic_unit_code"].sel(monitoring_location_id="USGS-1").item() + assert huc == "07070005" + assert ds["state_name"].sel(monitoring_location_id="USGS-1").item() == "Wisconsin" + assert ds["hydrologic_unit_code"].attrs["long_name"] == "hydrologic unit code (HUC)" + + +def test_site_metadata_absent_adds_no_coords(): + # No site_meta -> no HUC/state coords (back-compat with the old signature). + ds = wdx._build_timeseries( + _daily_frame(), _meta(), service="daily", series_meta=_DISCHARGE_META + ) + assert "hydrologic_unit_code" not in ds.coords + assert "state_name" not in ds.coords From 01b014b78c252f5ec7b086ba2505340000f7fd0b Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Wed, 27 May 2026 09:37:44 -0500 Subject: [PATCH 16/24] test(waterdata): spec stats hash-drop mocks against httpx.Client Completes the httpx migration rebase: the two stats hash-drop tests added in the PR mocked the client with spec=requests.Session, but the module now imports httpx, so the spec must be httpx.Client. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- tests/waterdata_utils_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/waterdata_utils_test.py b/tests/waterdata_utils_test.py index 6f9b6f76..2cbf4852 100644 --- a/tests/waterdata_utils_test.py +++ b/tests/waterdata_utils_test.py @@ -404,7 +404,7 @@ def test_get_stats_data_drops_hash_ids_by_default(monkeypatch): page1.elapsed = __import__("datetime").timedelta(milliseconds=1) page1.headers = {} page1.url = "https://example/stats" - client = mock.MagicMock(spec=requests.Session) + client = mock.MagicMock(spec=httpx.Client) client.send.return_value = page1 df, _ = get_stats_data( @@ -444,7 +444,7 @@ def test_get_stats_data_keeps_hash_ids_when_opted_in(monkeypatch): page1.elapsed = __import__("datetime").timedelta(milliseconds=1) page1.headers = {} page1.url = "https://example/stats" - client = mock.MagicMock(spec=requests.Session) + client = mock.MagicMock(spec=httpx.Client) client.send.return_value = page1 df, _ = get_stats_data( From 0f24f7f88868721085848e8ea92d999546074c2c Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Wed, 27 May 2026 09:37:54 -0500 Subject: [PATCH 17/24] feat(waterdata.xarray): ragged-default CF datasets, get_samples, schema registry Build on the dense CF converter with the changes from this round of work: * Default to a CF *contiguous ragged array* (value(obs) + row_size, one instance per series) so large, very ragged multi-site pulls store only real observations; keep the NaN-filled (site, time) grid via dense=True. * Add a get_samples xarray wrapper (discrete water-quality results) with characteristic/fraction instances and detection-condition/status as ancillary (censoring) variables. 
* Refactor the wrappers to a data-driven design: a _Schema value object for the column vocabulary (canonical / field / samples) and a _Service registry driving one _make_getter factory, replacing the three wrapper factories and the hand-written get_samples; one ragged + one dense builder serve all vocabularies. * Tighten the tests: a mutation-validated ragged-alignment guard, the field-schema (no-statistic) path, and stronger structure/registry asserts. Co-Authored-By: Claude Opus 4.7 (1M context) --- dataretrieval/waterdata/xarray.py | 586 ++++++++++++++++++++++++------ tests/waterdata_xarray_test.py | 390 ++++++++++++++++++-- 2 files changed, 825 insertions(+), 151 deletions(-) diff --git a/dataretrieval/waterdata/xarray.py b/dataretrieval/waterdata/xarray.py index 86af2e66..914308a3 100644 --- a/dataretrieval/waterdata/xarray.py +++ b/dataretrieval/waterdata/xarray.py @@ -1,30 +1,36 @@ -"""xarray-returning wrappers for the Water Data time-series getters. +"""xarray-returning wrappers for the Water Data getters. Each public function here mirrors the same-named function in :mod:`dataretrieval.waterdata`, but returns a CF-conventions -:class:`xarray.Dataset` instead of a :class:`pandas.DataFrame`. 
The series -descriptors that the plain getters leave behind (parameter name, units, -statistic/computation) are looked up automatically from the metadata -endpoints and written onto the dataset: - -* each observed parameter becomes a data variable with ``long_name``, - ``units`` (UDUNITS where a mapping is known), ``cell_methods`` (derived - from the statistic/computation), and ``standard_name`` where a confident - USGS-parameter-code -> CF mapping exists; -* the monitoring location is the CF "discrete sampling geometry" instance - dimension (``cf_role = "timeseries_id"``), with ``longitude`` / ``latitude`` - coordinates from the response geometry (whether or not geopandas is - installed), plus ``hydrologic_unit_code`` / ``state_name`` when the - metadata call already carries them; -* dataset-level attributes carry ``Conventions``, provenance, the request - URL, and ``date_modified`` (the most recent upstream record refresh). - -The wrappers call the underlying getter with its default, hash-free output --- so the large per-record UUID column is never fetched or materialized -- -and derive the CF attributes directly from the surviving columns -(``unit_of_measure`` -> ``units``, ``statistic_id`` -> ``cell_methods``, -``parameter_code`` -> ``standard_name``). Only the human-readable parameter -name comes from a small, cached metadata lookup keyed by ``parameter_code``. +:class:`xarray.Dataset` instead of a :class:`pandas.DataFrame`. + +By default the data is returned as a CF *contiguous ragged array* +(``featureType = "timeSeries"``): every observation is concatenated along a +single ``obs`` dimension, and each (monitoring location, parameter, +statistic) series is one instance along a ``timeseries`` dimension whose +``row_size`` records how many observations it contributes. This stores only +real observations -- no NaN fill -- so it scales to large, very ragged +multi-site pulls where record lengths differ by decades. 
Parameter, +statistic, unit, and location identity become per-instance metadata, and +``time`` is a 1-D coordinate along ``obs``. The trade-off is that ``time`` +is no longer a dimension you can index directly: to select by time you +first regroup the flat ``obs`` back into per-series views (e.g. with +``cf-xarray``, or via the offsets implied by ``row_size``). + +Pass ``dense=True`` for the alternative gridded layout: one named data +variable per parameter on a ``(monitoring_location_id, time)`` grid, NaN +where a series has no observation. This is ergonomic for a few overlapping +series (``ds["discharge"].sel(time=...)`` just works) but the union time +axis and NaN fill make it memory-costly for large ragged collections. + +Either way the CF metadata is derived from columns the getter already +returns (``unit_of_measure`` -> ``units``, ``statistic_id`` -> +``cell_methods``, ``parameter_code`` -> ``standard_name``), plus the +human-readable parameter name from a small cached metadata lookup; the +monitoring location carries ``cf_role = "timeseries_id"`` with ``longitude`` +/ ``latitude`` (and ``hydrologic_unit_code`` / ``state_name`` when the +metadata call already provides them), and dataset attributes carry +``Conventions``, provenance, the request URL, and ``date_modified``. This module requires the optional ``xarray`` dependency:: @@ -36,6 +42,8 @@ import datetime as _dt import re as _re import warnings as _warnings +from collections.abc import Callable +from dataclasses import dataclass, field, replace from functools import wraps as _wraps import pandas as _pd @@ -60,6 +68,7 @@ "get_latest_daily", "get_nearest_continuous", "get_peaks", + "get_samples", "get_stats_date_range", "get_stats_por", ] @@ -178,6 +187,43 @@ def _date_modified(df): return ts.max().isoformat() if len(ts) else None +def _prepare_values(df, group_cols, ancillary_cols): + """Slim the frame and coerce types, shared by the dense / ragged builders. 
+ + Keeps only the columns we convert, parses ``time`` to naive-UTC + ``datetime64`` (xarray has no tz dtype), coerces ``value`` to numeric, and + drops rows whose ``time`` is unparseable/missing (with a warning, so + observations are never lost without a trace). Returns + ``(work, group_cols, ancillary, has_unit)`` filtered to columns present. + """ + group_cols = [c for c in group_cols if c in df.columns] + ancillary = [c for c in ancillary_cols if c in df.columns] + has_unit = "unit_of_measure" in df.columns + cols = [_INSTANCE, "time", "value", *group_cols, *ancillary] + if has_unit: + cols.append("unit_of_measure") + present = [c for c in dict.fromkeys(cols) if c in df.columns] + work = df.loc[:, present].copy() + # Instance id, time, and value are mandatory to build a series. A response + # that lacks any of them (e.g. a non-result Samples profile) has nothing to + # convert, so return an empty frame -> empty Dataset rather than KeyError. + if not {_INSTANCE, "time", "value"}.issubset(work.columns): + return work.iloc[0:0], group_cols, ancillary, has_unit + work["time"] = _pd.to_datetime( + work["time"], errors="coerce", utc=True + ).dt.tz_localize(None) + work["value"] = _pd.to_numeric(work["value"], errors="coerce") + n_before = len(work) + work = work[work["time"].notna()] + dropped = n_before - len(work) + if dropped: + _warnings.warn( + f"dropped {dropped} row(s) with an unparseable or missing time.", + stacklevel=3, + ) + return work, group_cols, ancillary, has_unit + + def _var_attrs(desc, *, unit, pcode, stat, default_cell_method, ancillary, name): """Build the CF attribute dict for one data variable. 
@@ -276,53 +322,100 @@ def _point_coords(df, inst): _INSTANCE = "monitoring_location_id" -def _build_timeseries( +# --- column vocabulary ----------------------------------------------------- + +# Water-quality samples (Samples DB / WQX) speak a different column vocabulary +# than the time-series getters; map their tidy backbone onto the canonical +# names the builders understand. +_SAMPLES_RENAME = { + "Location_Identifier": _INSTANCE, + "Activity_StartDateTime": "time", + "Result_Measure": "value", + "Result_MeasureUnit": "unit_of_measure", + "Result_Characteristic": "characteristic", + "Result_SampleFraction": "sample_fraction", + "Result_ResultDetectionCondition": "detection_condition", + "Result_MeasureStatusIdentifier": "status", +} +_CANONICAL_COORD_ATTRS = { + "parameter_code": {"long_name": "USGS parameter code"}, + "statistic_id": {"long_name": "USGS statistic code"}, + "unit_of_measure": {"long_name": "unit of measurement"}, +} +_SAMPLES_COORD_ATTRS = { + "characteristic": {"long_name": "characteristic name"}, + "sample_fraction": {"long_name": "sample fraction"}, + "unit_of_measure": {"long_name": "unit of measurement"}, + "detection_condition": {"long_name": "result detection condition"}, + "status": {"long_name": "result status"}, +} + + +@dataclass(frozen=True) +class _Schema: + """How a service's columns map onto the canonical builder vocabulary. 
+ + One object describes a column dialect so a single set of builders serves + both the time-series getters and the (differently-named) Samples data: + + * ``rename`` -- source -> canonical column names (empty for the getters + that already use the canonical names); + * ``group_cols`` -- columns that identify one series (an instance); + * ``ancillary`` -- per-observation flag columns carried alongside ``value``; + * ``label_col`` -- the column whose value names the variable / ``long_name`` + (``parameter_code`` for the getters, ``characteristic`` for samples); + * ``infer_standard_name`` -- whether ``label_col`` is a USGS parameter code + eligible for a CF ``standard_name`` lookup (False for free-text + characteristics); + * ``coord_attrs`` -- ``long_name`` etc. for the per-instance metadata vars. + """ + + rename: dict = field(default_factory=dict) + group_cols: tuple = ("parameter_code", "statistic_id") + ancillary: tuple = _ANCILLARY + label_col: str = "parameter_code" + infer_standard_name: bool = True + coord_attrs: dict = field(default_factory=dict) + + +_CANONICAL = _Schema(coord_attrs=_CANONICAL_COORD_ATTRS) +_FIELD = replace(_CANONICAL, group_cols=("parameter_code",)) +_SAMPLES = _Schema( + rename=_SAMPLES_RENAME, + group_cols=("characteristic", "sample_fraction"), + ancillary=("detection_condition", "status"), + label_col="characteristic", + infer_standard_name=False, + coord_attrs=_SAMPLES_COORD_ATTRS, +) + + +def _build_dense( df, base_meta, *, service, - series_meta, + schema=_CANONICAL, + series_meta=None, site_meta=None, - group_cols=("parameter_code", "statistic_id"), default_cell_method=None, ): - """Hash-free values frame -> CF timeSeries Dataset (one var per parameter). - - The frame carries no hash columns (the wrappers fetch the default lean - output); every CF attribute is derived from ``parameter_code`` / - ``statistic_id`` / ``unit_of_measure`` already present, plus the - human-readable name from ``series_meta`` keyed by ``parameter_code``. 
- ``time`` and ``monitoring_location_id`` become the coordinates. + """Hash-free values frame -> dense CF timeSeries Dataset (one var/parameter). + + Parameters are pivoted onto a shared ``(monitoring_location_id, time)`` + grid, NaN where a series has no observation. Every CF attribute is derived + from ``parameter_code`` / ``statistic_id`` / ``unit_of_measure`` already + present, plus the human-readable name from ``series_meta`` keyed by + ``parameter_code``. Ergonomic for a few overlapping series but memory-costly + for large ragged collections; see :func:`_build_ragged` for the default. """ + series_meta = series_meta or {} if df is None or len(df) == 0: return _empty_dataset(service, base_meta) - - group_cols = [c for c in group_cols if c in df.columns] - ancillary = [c for c in _ANCILLARY if c in df.columns] - has_unit = "unit_of_measure" in df.columns - # Slim to just the columns we convert, so the heavy frame (and any columns - # we ignore) is not copied wholesale. - cols = [_INSTANCE, "time", "value", *group_cols, *ancillary] - if has_unit: - cols.append("unit_of_measure") - work = df.loc[:, list(dict.fromkeys(cols))].copy() - # Normalize to naive-UTC so xarray can store datetime64 (it has no tz dtype). - work["time"] = _pd.to_datetime( - work["time"], errors="coerce", utc=True - ).dt.tz_localize(None) - work["value"] = _pd.to_numeric(work["value"], errors="coerce") - - # A NaT time can't index the array and would be silently swallowed by - # ``to_xarray`` (or crash an all-NaT group). Drop such rows explicitly and - # say so, rather than losing observations without a trace. 
- n_before = len(work) - work = work[work["time"].notna()] - dropped = n_before - len(work) - if dropped: - _warnings.warn( - f"dropped {dropped} row(s) with an unparseable or missing time.", - stacklevel=3, - ) + df = df.rename(columns=schema.rename) if schema.rename else df + work, group_cols, ancillary, has_unit = _prepare_values( + df, schema.group_cols, schema.ancillary + ) if work.empty: return _empty_dataset(service, base_meta) @@ -410,6 +503,199 @@ def _build_timeseries( return ds +# --- ragged (CF contiguous ragged array) builders -------------------------- + + +def _assemble_ragged(work, *, inst_cols, ancillary, inst_first=()): + """Cleaned values frame -> CF contiguous-ragged Dataset skeleton. + + ``work`` must already be time-parsed (naive UTC), value-coerced, and free + of NaT times. Observations are concatenated along a single ``obs`` + dimension, sorted so each instance's rows are contiguous; ``row_size`` + records how many obs each instance contributes (the CF ``sample_dimension`` + link). ``inst_cols`` define instance identity; ``inst_first`` columns + (e.g. ``unit_of_measure``) are carried as one value per instance. Returns + ``(ds, inst_frame)`` where ``inst_frame`` is one row per instance. 
+ """ + work = work.sort_values([*inst_cols, "time"], kind="stable") + grp = work.groupby(inst_cols, dropna=False, sort=False) + row_size = grp.size() + idx = row_size.index + inst_frame = ( + idx.to_frame(index=False) + if isinstance(idx, _pd.MultiIndex) + else _pd.DataFrame({inst_cols[0]: idx}) + ) + data_vars = { + "value": ("obs", work["value"].to_numpy()), + "row_size": ("timeseries", row_size.to_numpy().astype("int32")), + } + for c in ancillary: + data_vars[c] = ("obs", work[c].to_numpy()) + coords = {"time": ("obs", work["time"].to_numpy())} + for c in inst_cols: + coords[c] = ("timeseries", inst_frame[c].to_numpy()) + for c in inst_first: + coords[c] = ("timeseries", grp[c].first().to_numpy()) + return _xr.Dataset(data_vars, coords=coords), inst_frame + + +def _finalize_ragged( + ds, geo_df, inst_frame, *, service, base_meta, ancillary, value_attrs +): + """Attach the structural + provenance CF metadata shared by ragged builders. + + Sets the ``value`` attrs (with ``ancillary_variables`` linked), dataset + attributes (``Conventions`` / provenance / ``date_modified``), the + ``row_size`` ``sample_dimension`` link, a per-instance ``timeseries_id`` + carrying ``cf_role`` (a site alone isn't unique once it has several + series), and ``longitude`` / ``latitude`` per instance from ``geo_df``. + """ + if ancillary: + value_attrs = {**value_attrs, "ancillary_variables": " ".join(ancillary)} + ds["value"].attrs = value_attrs + ds.attrs = _dataset_attrs(service, base_meta) + dm = _date_modified(geo_df) + if dm: + ds.attrs["date_modified"] = dm + ds["time"].attrs.setdefault("standard_name", "time") + ds["row_size"].attrs = { + "long_name": "number of observations per time series", + "sample_dimension": "obs", + } + # Join the instance keys into a cf_role id, skipping null parts so a + # missing key (e.g. a characteristic with no sample fraction) doesn't + # render as a literal "nan" token. 
+ ts_id = inst_frame.apply( + lambda r: ":".join(str(x) for x in r if _pd.notna(x)), axis=1 + ).to_numpy() + ds = ds.assign_coords(timeseries_id=("timeseries", ts_id)) + ds["timeseries_id"].attrs["cf_role"] = "timeseries_id" + ds[_INSTANCE].attrs.setdefault("long_name", "monitoring location identifier") + + coords = _point_coords(geo_df, _INSTANCE) + if coords is not None: + lon, lat = coords + sites = inst_frame[_INSTANCE].to_numpy() + ds = ds.assign_coords( + longitude=("timeseries", [lon.get(s) for s in sites]), + latitude=("timeseries", [lat.get(s) for s in sites]), + ) + ds["longitude"].attrs = {"standard_name": "longitude", "units": "degrees_east"} + ds["latitude"].attrs = {"standard_name": "latitude", "units": "degrees_north"} + return ds + + +def _build_ragged( + df, + base_meta, + *, + service, + schema=_CANONICAL, + series_meta=None, + site_meta=None, + default_cell_method=None, +): + """Hash-free values frame -> CF *contiguous ragged array* timeSeries Dataset. + + All observations are concatenated into one ``obs`` dimension; each series (a + ``schema.group_cols`` combination at a site) is an instance along + ``timeseries`` with ``row_size`` giving its length. Only real observations + are stored (no NaN fill), so this scales to large, very ragged multi-site + pulls. Per-series parameter / statistic / unit are instance coordinates; + descriptors homogeneous across instances are also written onto ``value``. 
+ """ + series_meta = series_meta or {} + if df is None or len(df) == 0: + return _empty_dataset(service, base_meta) + work_df = df.rename(columns=schema.rename) if schema.rename else df + work, group_cols, ancillary, has_unit = _prepare_values( + work_df, schema.group_cols, schema.ancillary + ) + if work.empty: + return _empty_dataset(service, base_meta) + + inst_cols = [_INSTANCE, *group_cols] + ds, inst_frame = _assemble_ragged( + work, + inst_cols=inst_cols, + ancillary=ancillary, + inst_first=(["unit_of_measure"] if has_unit else []), + ) + + # Descriptors go on ``value`` only when homogeneous across instances; + # otherwise they vary per series and live on the instance coordinates. + labels = ( + inst_frame[schema.label_col].dropna().unique() + if schema.label_col in inst_frame + else [] + ) + stats = ( + inst_frame["statistic_id"].dropna().unique() + if "statistic_id" in inst_frame + else [] + ) + units = ( + _pd.unique(ds["unit_of_measure"].to_series().dropna()) + if "unit_of_measure" in ds.coords + else [] + ) + unit = units[0] if len(units) == 1 else None + if len(labels) == 1: + if schema.infer_standard_name: + desc, pcode = series_meta.get(str(labels[0]), {}), labels[0] + else: # free-text label (e.g. a characteristic): it *is* the name + desc, pcode = {"parameter_name": str(labels[0])}, None + value_attrs = _var_attrs( + desc, + unit=unit, + pcode=pcode, + stat=stats[0] if len(stats) == 1 else None, + default_cell_method=default_cell_method, + ancillary=(), + name="value", + ) + else: + value_attrs = { + "long_name": "measured value", + "comment": ( + "multiple series with differing metadata are stacked here; see " + "the per-timeseries coordinates for each series' identity" + ), + } + if unit is not None: + value_attrs["units"] = CF_UNIT_MAP.get(str(unit), str(unit)) + # A service-wide cell method (e.g. 
samples are instantaneous grabs) + # still applies when the statistic doesn't vary; the time-series getters + # leave per-parameter cell methods to the per-instance coordinates. + if default_cell_method and not schema.infer_standard_name: + value_attrs["cell_methods"] = f"time: {default_cell_method}" + + ds = _finalize_ragged( + ds, + work_df, + inst_frame, + service=service, + base_meta=base_meta, + ancillary=ancillary, + value_attrs=value_attrs, + ) + + for col, col_attrs in schema.coord_attrs.items(): + if col in ds.variables: + ds[col].attrs.update(col_attrs) + + if site_meta: + sites = inst_frame[_INSTANCE].to_numpy() + for col, col_attrs in _SITE_COORD_ATTRS.items(): + vals = [site_meta.get(str(s), {}).get(col) for s in sites] + if any(v is not None for v in vals): + ds = ds.assign_coords({col: ("timeseries", vals)}) + ds[col].attrs.update(col_attrs) + + return ds + + def _build_stats(df, base_meta, service): """Best-effort, preliminary conversion of the statistics tables. @@ -430,7 +716,7 @@ def _build_stats(df, base_meta, service): return ds -# --- public wrappers ------------------------------------------------------- +# --- public getters -------------------------------------------------------- def _sites(df): @@ -451,20 +737,30 @@ def _fetch(func, args, kwargs): return func(*args, **kwargs) -def _xr_doc(func, *, cf_metadata=True): +def _xr_doc(func, *, cf_metadata=True, allow_dense=True): """Prepend an xarray note to the wrapped getter's docstring. - ``cf_metadata=False`` describes the preliminary stats path, which emits a - flat Dataset without per-variable CF attributes. + ``cf_metadata=False`` describes the preliminary stats path (a flat Dataset + without per-variable CF attributes); ``allow_dense=False`` describes a + ragged-only path (samples), where ``dense=`` does not apply. 
""" - returns = ( - "a CF-conventions ``xarray.Dataset`` with series metadata populated" - if cf_metadata - else ( + if not cf_metadata: + returns = ( "a preliminary, flat ``xarray.Dataset`` (dataset-level provenance " "only; per-variable CF metadata is not yet populated)" ) - ) + elif allow_dense: + returns = ( + "a CF-conventions ``xarray.Dataset`` as a contiguous ragged array " + "(pass ``dense=True`` for the NaN-filled (site, time) grid with one " + "named variable per parameter)" + ) + else: + returns = ( + "a CF-conventions ``xarray.Dataset`` as a contiguous ragged array " + "(always ragged; discrete samples are too sparse for a dense grid, " + "so ``dense=`` does not apply)" + ) note = ( " xarray wrapper: same arguments as " f"``dataretrieval.waterdata.{func.__name__}``, but returns\n" @@ -474,69 +770,119 @@ def _xr_doc(func, *, cf_metadata=True): return note + (func.__doc__ or "") -def _timeseries_wrapper(func, *, service, default_cell_method=None): - @_wraps(func) - def wrapper(*args, **kwargs): - df, base_meta = _fetch(func, args, kwargs) - series_meta, site_meta = _timeseries_metadata(_sites(df)) - return _build_timeseries( - df, - base_meta, - service=service, - series_meta=series_meta, - site_meta=site_meta, - default_cell_method=default_cell_method, - ) +@dataclass(frozen=True) +class _Service: + """Per-service configuration that drives one public xarray getter. + + The variation across services is data, not behaviour: which getter to + call, how to look up series metadata, the column ``schema``, the fallback + ``cell_methods`` operator, whether the result is a time series or the + preliminary stats table, and whether a dense grid is offered. 
+ """ - wrapper.__doc__ = _xr_doc(func) - return wrapper + getter: Callable + service: str + metadata: str | None = None # "timeseries" | "field" | None + schema: _Schema = _CANONICAL + default_cell_method: str | None = None + layout: str = "series" # "series" | "stats" + allow_dense: bool = True -def _field_wrapper(func, *, service): - @_wraps(func) - def wrapper(*args, **kwargs): - df, base_meta = _fetch(func, args, kwargs) - series_meta, site_meta = _field_metadata(_sites(df)) - return _build_timeseries( +def _series_metadata(source, df): + """Resolve the named metadata source at call time (stays monkeypatchable).""" + if source == "timeseries": + return _timeseries_metadata(_sites(df)) + if source == "field": + return _field_metadata(_sites(df)) + return {}, {} + + +def _make_getter(spec): + """Build the public getter for one ``_Service`` spec.""" + + has_dense = spec.layout != "stats" and spec.allow_dense + + @_wraps(spec.getter) # carry the real signature/__wrapped__ for help()/IDEs + def getter(*args, dense=False, **kwargs): + if dense and not has_dense: + _warnings.warn( + f"{spec.getter.__name__} has no dense layout; ignoring dense=True.", + stacklevel=2, + ) + dense = False + df, base_meta = _fetch(spec.getter, args, kwargs) + if spec.layout == "stats": + return _build_stats(df, base_meta, spec.service) + series_meta, site_meta = _series_metadata(spec.metadata, df) + build = _build_dense if dense else _build_ragged + return build( df, base_meta, - service=service, + service=spec.service, + schema=spec.schema, series_meta=series_meta, site_meta=site_meta, - group_cols=("parameter_code",), - default_cell_method="point", + default_cell_method=spec.default_cell_method, ) - wrapper.__doc__ = _xr_doc(func) - return wrapper - - -def _stats_wrapper(func, *, service): - @_wraps(func) - def wrapper(*args, **kwargs): - df, base_meta = _fetch(func, args, kwargs) - return _build_stats(df, base_meta, service) - - wrapper.__doc__ = _xr_doc(func, cf_metadata=False) - 
return wrapper + # ``@_wraps`` copied the getter's docstring; replace it with the xarray note. + getter.__doc__ = _xr_doc( + spec.getter, + cf_metadata=(spec.layout != "stats"), + allow_dense=spec.allow_dense, + ) + return getter -get_daily = _timeseries_wrapper(_api.get_daily, service="daily") -get_continuous = _timeseries_wrapper(_api.get_continuous, service="continuous") -get_latest_continuous = _timeseries_wrapper( - _api.get_latest_continuous, service="latest-continuous", default_cell_method="point" +get_daily = _make_getter(_Service(_api.get_daily, "daily", "timeseries")) +get_continuous = _make_getter(_Service(_api.get_continuous, "continuous", "timeseries")) +get_latest_continuous = _make_getter( + _Service( + _api.get_latest_continuous, + "latest-continuous", + "timeseries", + default_cell_method="point", + ) +) +get_latest_daily = _make_getter( + _Service( + _api.get_latest_daily, + "latest-daily", + "timeseries", + default_cell_method="point", + ) ) -get_latest_daily = _timeseries_wrapper( - _api.get_latest_daily, service="latest-daily", default_cell_method="point" +get_nearest_continuous = _make_getter( + _Service( + _get_nearest_continuous, + "continuous", + "timeseries", + default_cell_method="point", + ) +) +get_peaks = _make_getter( + _Service(_api.get_peaks, "peaks", "timeseries", default_cell_method="maximum") ) -get_nearest_continuous = _timeseries_wrapper( - _get_nearest_continuous, service="continuous", default_cell_method="point" +get_field_measurements = _make_getter( + _Service( + _api.get_field_measurements, + "field-measurements", + "field", + schema=_FIELD, + default_cell_method="point", + ) ) -get_peaks = _timeseries_wrapper( - _api.get_peaks, service="peaks", default_cell_method="maximum" +get_stats_por = _make_getter(_Service(_api.get_stats_por, "statistics", layout="stats")) +get_stats_date_range = _make_getter( + _Service(_api.get_stats_date_range, "statistics", layout="stats") ) -get_field_measurements = _field_wrapper( - 
_api.get_field_measurements, service="field-measurements" +get_samples = _make_getter( + _Service( + _api.get_samples, + "samples", + schema=_SAMPLES, + default_cell_method="point", + allow_dense=False, + ) ) -get_stats_por = _stats_wrapper(_api.get_stats_por, service="statistics") -get_stats_date_range = _stats_wrapper(_api.get_stats_date_range, service="statistics") diff --git a/tests/waterdata_xarray_test.py b/tests/waterdata_xarray_test.py index 2de8f820..af4e1981 100644 --- a/tests/waterdata_xarray_test.py +++ b/tests/waterdata_xarray_test.py @@ -51,7 +51,7 @@ def _daily_frame( def test_build_timeseries_cf_attributes(): - ds = wdx._build_timeseries( + ds = wdx._build_dense( _daily_frame(), _meta(), service="daily", series_meta=_DISCHARGE_META ) assert isinstance(ds, xr.Dataset) @@ -73,7 +73,7 @@ def test_build_timeseries_cf_attributes(): def test_ancillary_variables_linked(): - ds = wdx._build_timeseries( + ds = wdx._build_dense( _daily_frame(), _meta(), service="daily", series_meta=_DISCHARGE_META ) assert "discharge_qualifier" in ds.data_vars @@ -86,9 +86,7 @@ def test_ancillary_variables_linked(): def test_unknown_unit_passes_through_verbatim(): df = _daily_frame() df["unit_of_measure"] = "widgets/s" # units are read from the frame - ds = wdx._build_timeseries( - df, _meta(), service="daily", series_meta=_DISCHARGE_META - ) + ds = wdx._build_dense(df, _meta(), service="daily", series_meta=_DISCHARGE_META) assert ds["discharge"].attrs["units"] == "widgets/s" @@ -97,7 +95,7 @@ def test_missing_standard_name_is_omitted(): meta = {"99999": {"parameter_name": "Mystery", "parameter_description": "Mystery"}} df = _daily_frame() df["parameter_code"] = "99999" - ds = wdx._build_timeseries(df, _meta(), service="daily", series_meta=meta) + ds = wdx._build_dense(df, _meta(), service="daily", series_meta=meta) assert "standard_name" not in ds["mystery"].attrs assert ds["mystery"].attrs["usgs_parameter_code"] == "99999" @@ -123,7 +121,7 @@ def 
test_multiple_parameters_outer_join_on_time(): "parameter_name": "Temperature, water", "parameter_description": "Temperature, water, degrees Celsius", } - ds = wdx._build_timeseries( + ds = wdx._build_dense( pd.concat([q, t]), _meta(), service="continuous", series_meta=meta ) assert {"discharge", "temperature_water"} <= set(ds.data_vars) @@ -140,7 +138,7 @@ def test_collision_dedups_with_warning(): a = _daily_frame(values=(100,), times=("2024-06-01",)) b = _daily_frame(values=(200,), times=("2024-06-01",)) with pytest.warns(UserWarning, match="multiple values per"): - ds = wdx._build_timeseries( + ds = wdx._build_dense( pd.concat([a, b]), _meta(), service="daily", series_meta=_DISCHARGE_META ) assert "monitoring_location_id" in ds.dims @@ -152,7 +150,7 @@ def test_collision_dedups_with_warning(): def test_empty_frame_returns_dataset_with_conventions(): - ds = wdx._build_timeseries(pd.DataFrame(), _meta(), service="daily", series_meta={}) + ds = wdx._build_dense(pd.DataFrame(), _meta(), service="daily", series_meta={}) assert isinstance(ds, xr.Dataset) assert list(ds.data_vars) == [] assert ds.attrs["Conventions"] == "CF-1.11" @@ -177,7 +175,11 @@ def test_public_wrappers_exist_and_are_documented(): for name in wdx.__all__: fn = getattr(wdx, name) assert callable(fn) + # _make_getter carries the wrapped getter's name through, so the public + # symbol, its __name__, and the docstring reference all agree. 
+ assert fn.__name__ == name assert "xarray wrapper" in (fn.__doc__ or "") + assert f"dataretrieval.waterdata.{name}" in (fn.__doc__ or "") def test_fetch_strips_include_hash(): @@ -226,9 +228,7 @@ def test_unparseable_time_dropped_with_warning(): times=("2024-06-01", "not-a-date", "2024-06-03"), ) with pytest.warns(UserWarning, match="unparseable or missing time"): - ds = wdx._build_timeseries( - df, _meta(), service="daily", series_meta=_DISCHARGE_META - ) + ds = wdx._build_dense(df, _meta(), service="daily", series_meta=_DISCHARGE_META) assert ds.sizes["time"] == 2 # the bad-time row is gone, the good ones stay assert 110 not in ds["discharge"].values # the dropped value did not survive @@ -236,9 +236,7 @@ def test_unparseable_time_dropped_with_warning(): def test_all_unparseable_time_returns_empty_dataset(): df = _daily_frame(values=(1, 2), times=("bad-a", "bad-b")) with pytest.warns(UserWarning, match="unparseable or missing time"): - ds = wdx._build_timeseries( - df, _meta(), service="daily", series_meta=_DISCHARGE_META - ) + ds = wdx._build_dense(df, _meta(), service="daily", series_meta=_DISCHARGE_META) assert list(ds.data_vars) == [] assert ds.attrs["Conventions"] == "CF-1.11" @@ -250,7 +248,7 @@ def test_mixed_units_in_one_variable_warns(): b = _daily_frame(site="USGS-2", values=(3,), times=("2024-06-01",)) b["unit_of_measure"] = "m3 s-1" with pytest.warns(UserWarning, match="spans multiple units"): - ds = wdx._build_timeseries( + ds = wdx._build_dense( pd.concat([a, b]), _meta(), service="daily", series_meta=_DISCHARGE_META ) assert ds.sizes["monitoring_location_id"] == 2 @@ -263,9 +261,7 @@ def test_point_coords_from_list_geometry(): # ``.x``/``.y`` access raised AttributeError and silently dropped them). 
df = _daily_frame() df["geometry"] = [[-90.44, 43.19]] * len(df) - ds = wdx._build_timeseries( - df, _meta(), service="daily", series_meta=_DISCHARGE_META - ) + ds = wdx._build_dense(df, _meta(), service="daily", series_meta=_DISCHARGE_META) assert "longitude" in ds.coords and "latitude" in ds.coords assert ds["longitude"].sel(monitoring_location_id="USGS-1").item() == -90.44 assert ds["latitude"].sel(monitoring_location_id="USGS-1").item() == 43.19 @@ -278,9 +274,7 @@ def test_point_coords_from_pointlike_geometry(): # exposing .x/.y is read directly. df = _daily_frame() df["geometry"] = [SimpleNamespace(x=-90.44, y=43.19)] * len(df) - ds = wdx._build_timeseries( - df, _meta(), service="daily", series_meta=_DISCHARGE_META - ) + ds = wdx._build_dense(df, _meta(), service="daily", series_meta=_DISCHARGE_META) assert ds["longitude"].sel(monitoring_location_id="USGS-1").item() == -90.44 assert ds["latitude"].sel(monitoring_location_id="USGS-1").item() == 43.19 @@ -290,9 +284,7 @@ def test_non_point_geometry_skipped(): # guessed -- no lon/lat coords are added. df = _daily_frame() df["geometry"] = [object()] * len(df) - ds = wdx._build_timeseries( - df, _meta(), service="daily", series_meta=_DISCHARGE_META - ) + ds = wdx._build_dense(df, _meta(), service="daily", series_meta=_DISCHARGE_META) assert "longitude" not in ds.coords and "latitude" not in ds.coords @@ -300,15 +292,13 @@ def test_date_modified_from_last_modified(): # The newest last_modified becomes the dataset-level date_modified (ACDD). df = _daily_frame() df["last_modified"] = ["2024-06-01T00:00:00Z", "2024-06-10T12:00:00Z"] - ds = wdx._build_timeseries( - df, _meta(), service="daily", series_meta=_DISCHARGE_META - ) + ds = wdx._build_dense(df, _meta(), service="daily", series_meta=_DISCHARGE_META) assert ds.attrs["date_modified"].startswith("2024-06-10") def test_no_date_modified_without_last_modified(): # The default frame has no last_modified column -> no date_modified attr. 
- ds = wdx._build_timeseries( + ds = wdx._build_dense( _daily_frame(), _meta(), service="daily", series_meta=_DISCHARGE_META ) assert "date_modified" not in ds.attrs @@ -320,7 +310,7 @@ def test_site_metadata_coordinates(): site_meta = { "USGS-1": {"hydrologic_unit_code": "07070005", "state_name": "Wisconsin"} } - ds = wdx._build_timeseries( + ds = wdx._build_dense( _daily_frame(), _meta(), service="daily", @@ -335,8 +325,346 @@ def test_site_metadata_coordinates(): def test_site_metadata_absent_adds_no_coords(): # No site_meta -> no HUC/state coords (back-compat with the old signature). - ds = wdx._build_timeseries( + ds = wdx._build_dense( _daily_frame(), _meta(), service="daily", series_meta=_DISCHARGE_META ) assert "hydrologic_unit_code" not in ds.coords assert "state_name" not in ds.coords + + +# --- ragged layout (the default) ------------------------------------------- + + +def test_ragged_structure_and_cf_attrs(): + # Single (site, parameter, statistic) -> one instance; value attrs are set + # because the descriptors are homogeneous across the (one) instance. 
+ ds = wdx._build_ragged( + _daily_frame(), _meta(), service="daily", series_meta=_DISCHARGE_META + ) + assert set(ds.sizes) == {"obs", "timeseries"} + assert ds.sizes == {"obs": 2, "timeseries": 1} + assert ds["value"].dims == ("obs",) + assert list(ds["value"].values) == [100, 110] + assert ds["row_size"].dims == ("timeseries",) + assert int(ds["row_size"][0]) == 2 + assert ds["row_size"].attrs["sample_dimension"] == "obs" + # per-instance identity + cf_role on the synthesized timeseries_id + assert ds["monitoring_location_id"].dims == ("timeseries",) + assert ds["timeseries_id"].attrs["cf_role"] == "timeseries_id" + assert ds["timeseries_id"].values[0] == "USGS-1:00060:00003" + # homogeneous descriptors land on value + v = ds["value"] + assert v.attrs["long_name"] == "Discharge, cubic feet per second" + assert v.attrs["units"] == "ft3 s-1" + assert v.attrs["cell_methods"] == "time: mean" + assert v.attrs["standard_name"] == "water_volume_transport_in_river_channel" + assert v.attrs["ancillary_variables"] == "qualifier approval_status" + assert ds.attrs["featureType"] == "timeSeries" + # ancillary flags are per-observation; metadata is per-instance + assert ds["qualifier"].dims == ("obs",) + assert ds["approval_status"].dims == ("obs",) + assert ds["parameter_code"].dims == ("timeseries",) + assert ds["parameter_code"].values.tolist() == ["00060"] + assert ds["statistic_id"].values.tolist() == ["00003"] + assert ds["unit_of_measure"].values.tolist() == ["ft^3/s"] + assert ds["parameter_code"].attrs["long_name"] == "USGS parameter code" + + +def test_ragged_stores_only_real_observations(): + # Two sites of very different length: obs == sum of lengths, no NaN fill. 
+ a = _daily_frame( + site="USGS-A", + values=(1, 2, 3), + times=("2024-06-01", "2024-06-02", "2024-06-03"), + ) + b = _daily_frame(site="USGS-B", values=(9,), times=("2024-06-03",)) + ds = wdx._build_ragged( + pd.concat([a, b]), _meta(), service="daily", series_meta=_DISCHARGE_META + ) + assert ds.sizes == {"obs": 4, "timeseries": 2} # 3 + 1, not 2 x 3 grid + assert not pd.isna(ds["value"].values).any() + assert sorted(ds["row_size"].values.tolist()) == [1, 3] + + +def test_ragged_mixed_parameters_value_is_generic(): + # Mixed parameters/units -> value carries no single units/standard_name; + # the per-instance parameter_code coordinate disambiguates. + q = _daily_frame(values=(100, 110), times=("2024-06-01", "2024-06-02")) + t = pd.DataFrame( + { + "time": ["2024-06-02", "2024-06-03"], + "value": [18.0, 19.0], + "monitoring_location_id": ["USGS-1", "USGS-1"], + "parameter_code": ["00010", "00010"], + "statistic_id": ["00011", "00011"], + "unit_of_measure": ["degC", "degC"], + "qualifier": [None, None], + "approval_status": ["Approved", "Approved"], + "time_series_id": ["B", "B"], + } + ) + meta = dict(_DISCHARGE_META) + meta["00010"] = { + "parameter_name": "Temperature, water", + "parameter_description": "Temperature, water, degrees Celsius", + } + ds = wdx._build_ragged( + pd.concat([q, t]), _meta(), service="continuous", series_meta=meta + ) + assert ds.sizes == {"obs": 4, "timeseries": 2} + assert ds["value"].attrs["long_name"] == "measured value" + assert "units" not in ds["value"].attrs # ft^3/s vs degC -> not homogeneous + assert "standard_name" not in ds["value"].attrs + assert set(ds["parameter_code"].values) == {"00060", "00010"} + + +def test_ragged_keeps_duplicate_observations_without_warning(): + # The dense path warns + dedups colliding (site, time); ragged just keeps + # both observations -- no grid, so no ambiguity. 
Assert specifically that + # neither dense-path diagnostic fires (not "no warning at all", so an + # unrelated pandas warning can't fail the test for the wrong reason). + a = _daily_frame(values=(100,), times=("2024-06-01",)) + b = _daily_frame(values=(200,), times=("2024-06-01",)) + import warnings as _w + + with _w.catch_warnings(record=True) as caught: + _w.simplefilter("always") + ds = wdx._build_ragged( + pd.concat([a, b]), _meta(), service="daily", series_meta=_DISCHARGE_META + ) + msgs = [str(w.message) for w in caught] + assert not any( + "multiple values per" in m or "spans multiple units" in m for m in msgs + ), msgs + assert ds.sizes["obs"] == 2 + assert sorted(ds["value"].values.tolist()) == [100, 200] + + +def test_ragged_lonlat_and_date_modified_per_instance(): + df = _daily_frame() + df["geometry"] = [[-90.44, 43.19]] * len(df) + df["last_modified"] = ["2024-06-01T00:00:00Z", "2024-06-10T12:00:00Z"] + ds = wdx._build_ragged(df, _meta(), service="daily", series_meta=_DISCHARGE_META) + assert ds["longitude"].dims == ("timeseries",) + assert float(ds["longitude"][0]) == -90.44 + assert float(ds["latitude"][0]) == 43.19 + assert ds.attrs["date_modified"].startswith("2024-06-10") + + +def _instance_blocks(ds): + """Map each instance's (parameter, statistic) -> (time-sorted values, unit) + by walking the row_size offsets -- the reader's view of a ragged array.""" + starts, acc = [], 0 + for n in ds["row_size"].values.tolist(): + starts.append(acc) + acc += int(n) + starts.append(acc) + blocks = {} + for i in range(ds.sizes["timeseries"]): + sl = slice(starts[i], starts[i + 1]) + key = (str(ds["parameter_code"].values[i]), str(ds["statistic_id"].values[i])) + blocks[key] = ( + ds["value"].values[sl].tolist(), + str(ds["unit_of_measure"].values[i]), + ) + return blocks + + +def test_ragged_alignment_survives_interleaved_input(): + # The critical invariant: row_size, the contiguous obs blocks, and the + # per-instance metadata must all stay aligned even when 
the input rows are + # interleaved across instances and out of time order. A regression in the + # sort/group ordering would silently map values to the wrong series. + cols = [ + "monitoring_location_id", + "parameter_code", + "statistic_id", + "unit_of_measure", + "time", + "value", + ] + # Values are deliberately NON-monotonic with time within each instance, so + # a regression that ordered obs by value (instead of time) would produce a + # different sequence and fail -- "sorted values" alone wouldn't catch that. + rows = [ + ("USGS-1", "00060", "00003", "ft^3/s", "2024-06-02", 100), # later, smaller + ("USGS-1", "00010", "00011", "degC", "2024-06-01", 19), + ("USGS-1", "00060", "00001", "ft^3/s", "2024-06-03", 500), + ("USGS-1", "00060", "00003", "ft^3/s", "2024-06-01", 150), # earlier, larger + ("USGS-1", "00010", "00011", "degC", "2024-06-03", 18), + ("USGS-1", "00060", "00001", "ft^3/s", "2024-06-01", 480), + ] + df = pd.DataFrame(rows, columns=cols) + ds = wdx._build_ragged(df, _meta(), service="daily", series_meta={}) + + assert ds.sizes == {"obs": 6, "timeseries": 3} + assert int(ds["row_size"].sum()) == ds.sizes["obs"] + blocks = _instance_blocks(ds) + # each instance's values + unit are the ones that belong to it, in TIME order + # (not value order: 150 precedes 100, and 19 precedes 18) + assert blocks[("00060", "00003")] == ([150, 100], "ft^3/s") + assert blocks[("00060", "00001")] == ([480, 500], "ft^3/s") + assert blocks[("00010", "00011")] == ([19, 18], "degC") + + +def test_ragged_field_schema_without_statistic(): + # The field-measurements schema groups by parameter_code only (no + # statistic_id): the instance has no statistic segment, and the service + # default cell method fills in. 
+ df = pd.DataFrame( + { + "monitoring_location_id": ["USGS-1", "USGS-1"], + "parameter_code": ["00060", "00060"], + "time": ["2024-06-01", "2024-06-02"], + "value": [100, 110], + "unit_of_measure": ["ft^3/s", "ft^3/s"], + } + ) + ds = wdx._build_ragged( + df, + _meta(), + service="field-measurements", + schema=wdx._FIELD, + series_meta=_DISCHARGE_META, + default_cell_method="point", + ) + assert ds.sizes == {"obs": 2, "timeseries": 1} + assert "statistic_id" not in ds.coords + assert ds["timeseries_id"].values[0] == "USGS-1:00060" # no stat segment + assert ds["value"].attrs["cell_methods"] == "time: point" + assert ds["value"].attrs["long_name"] == "Discharge, cubic feet per second" + + +# --- dense opt-out wiring --------------------------------------------------- + + +def test_wrapper_defaults_to_ragged_and_dense_opt_out(monkeypatch): + # Default wrapper output is ragged; dense=True returns the gridded layout. + monkeypatch.setattr(wdx, "_fetch", lambda func, a, k: (_daily_frame(), _meta())) + monkeypatch.setattr( + wdx, "_timeseries_metadata", lambda sites: (_DISCHARGE_META, {}) + ) + + ragged = wdx.get_daily(monitoring_location_id="USGS-1") + assert "obs" in ragged.sizes and "value" in ragged.data_vars + assert "discharge" not in ragged.data_vars + + dense = wdx.get_daily(monitoring_location_id="USGS-1", dense=True) + assert "discharge" in dense.data_vars + assert "obs" not in dense.sizes + + +# --- water-quality samples -------------------------------------------------- + + +def _samples_frame(characteristics=("Temperature, water",), units=("deg C",)): + rows = [] + for ch, u in zip(characteristics, units): + rows.append( + { + "Location_Identifier": "USGS-1", + "Activity_StartDateTime": "2020-07-10T12:00:00Z", + "Result_Characteristic": ch, + "Result_SampleFraction": "Total", + "Result_Measure": 12.5, + "Result_MeasureUnit": u, + "Result_ResultDetectionCondition": None, + "Result_MeasureStatusIdentifier": "Provisional", + } + ) + return pd.DataFrame(rows) + 
+ +def _samples_ds(frame): + """Build a samples ragged Dataset the way the get_samples service does.""" + return wdx._build_ragged( + frame, + _meta(), + service="samples", + schema=wdx._SAMPLES, + default_cell_method="point", + ) + + +def test_build_samples_single_characteristic(): + ds = _samples_ds(_samples_frame()) + assert set(ds.sizes) == {"obs", "timeseries"} + assert ds["value"].dims == ("obs",) + assert ds["characteristic"].values[0] == "Temperature, water" + assert ds["value"].attrs["long_name"] == "Temperature, water" + assert ds["value"].attrs["cell_methods"] == "time: point" + # censoring columns are ancillary, linked from value + assert "detection_condition" in ds.variables and "status" in ds.variables + assert ds["value"].attrs["ancillary_variables"] == "detection_condition status" + assert ds["timeseries_id"].attrs["cf_role"] == "timeseries_id" + + +def test_build_samples_mixed_characteristics_generic_value(): + ds = _samples_ds( + _samples_frame(("Temperature, water", "pH"), ("deg C", "std units")) + ) + assert ds.sizes["timeseries"] == 2 + assert ds["value"].attrs["long_name"] == "measured value" + assert set(ds["characteristic"].values) == {"Temperature, water", "pH"} + + +def test_get_samples_wrapper_builds_ragged(monkeypatch): + monkeypatch.setattr(wdx, "_fetch", lambda func, a, k: (_samples_frame(), _meta())) + ds = wdx.get_samples(monitoringLocationIdentifier="USGS-1", include_hash=True) + assert "obs" in ds.sizes and "value" in ds.data_vars + assert ds["characteristic"].values[0] == "Temperature, water" + + +def test_prepare_values_missing_required_column_returns_empty(): + # A frame lacking a mandatory column (here: no value) must degrade to an + # empty Dataset, not raise KeyError from the column slim. 
+ df = _daily_frame().drop(columns=["value"]) + ds = wdx._build_ragged(df, _meta(), service="daily", series_meta=_DISCHARGE_META) + assert isinstance(ds, xr.Dataset) + assert list(ds.data_vars) == [] + assert ds.attrs["Conventions"] == "CF-1.11" + + +def test_build_samples_missing_value_returns_empty(): + # A non-result Samples profile (no Result_Measure -> no "value") must not + # crash; it has nothing to convert. + df = pd.DataFrame( + { + "Location_Identifier": ["USGS-1"], + "Activity_StartDateTime": ["2020-07-10T12:00:00Z"], + "Result_Characteristic": ["pH"], + } + ) + ds = _samples_ds(df) + assert list(ds.data_vars) == [] + assert ds.attrs["Conventions"] == "CF-1.11" + + +def test_get_samples_ignores_dense_with_warning(monkeypatch): + # dense=True is advertised generically; get_samples is always ragged, so it + # must accept-and-ignore (with a warning) rather than leak dense= to the + # underlying getter (which would TypeError). + monkeypatch.setattr(wdx, "_fetch", lambda func, a, k: (_samples_frame(), _meta())) + with pytest.warns(UserWarning, match="no dense layout"): + ds = wdx.get_samples(monitoringLocationIdentifier="USGS-1", dense=True) + assert "obs" in ds.sizes # still ragged + + +def test_get_stats_ignores_dense_with_warning(monkeypatch): + # The stats layout has no dense grid either; dense=True is ignored with the + # same warning (consistent with samples), not silently swallowed. + frame = pd.DataFrame({"monitoring_location_id": ["USGS-1"], "p50_va": [120.0]}) + monkeypatch.setattr(wdx, "_fetch", lambda func, a, k: (frame, _meta())) + with pytest.warns(UserWarning, match="no dense layout"): + ds = wdx.get_stats_por(monitoring_location_id="USGS-1", dense=True) + assert isinstance(ds, xr.Dataset) + + +def test_timeseries_id_skips_missing_instance_key(): + # A NaN instance key (e.g. a characteristic with no sample fraction) must + # not render as a literal "nan" token in the cf_role timeseries_id. 
+ df = _samples_frame() + df["Result_SampleFraction"] = None + ds = _samples_ds(df) + assert all("nan" not in tid for tid in ds["timeseries_id"].values) + assert ds["timeseries_id"].values[0] == "USGS-1:Temperature, water" From c58e96e4213e4e2fd292d98a4103c8855a529c3e Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Wed, 27 May 2026 09:38:01 -0500 Subject: [PATCH 18/24] docs(waterdata.xarray): add demo notebook for the xarray wrappers Walk through the ragged default, the dense=True opt-out, the select-by-time trade-off (and regrouping), multiple parameters, get_samples, and writing CF netCDF. Wire it into the examples toctree via an nblink. Co-Authored-By: Claude Opus 4.7 (1M context) --- demos/waterdata_xarray_demo.ipynb | 368 ++++++++++++++++++ docs/source/examples/index.rst | 12 + .../examples/waterdata_xarray_demo.nblink | 3 + 3 files changed, 383 insertions(+) create mode 100644 demos/waterdata_xarray_demo.ipynb create mode 100644 docs/source/examples/waterdata_xarray_demo.nblink diff --git a/demos/waterdata_xarray_demo.ipynb b/demos/waterdata_xarray_demo.ipynb new file mode 100644 index 00000000..ab9d521f --- /dev/null +++ b/demos/waterdata_xarray_demo.ipynb @@ -0,0 +1,368 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# CF-conventions `xarray` datasets from the `waterdata` module\n", + "\n", + "The `dataretrieval.waterdata.xarray` module mirrors the `waterdata` getters\n", + "(`get_daily`, `get_continuous`, `get_peaks`, `get_samples`, …) but returns a\n", + "[CF-conventions](https://cfconventions.org/) [`xarray.Dataset`](https://docs.xarray.dev/)\n", + "instead of a `pandas.DataFrame`. Parameter names, units, statistics, and station\n", + "metadata are looked up and written onto the dataset as CF attributes, so the result\n", + "is self-describing and ready to write to netCDF.\n", + "\n", + "This notebook covers:\n", + "\n", + "1. the default **ragged** layout and how to read it;\n", + "2. 
**why** ragged is the default (and the `dense=True` opt-out);\n", + "3. the one trade-off you need to know — **selecting by time**;\n", + "4. multiple parameters in one pull;\n", + "5. water-quality samples (`get_samples`);\n", + "6. writing CF netCDF." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup\n", + "\n", + "The xarray helpers are an optional extra:\n", + "\n", + "```bash\n", + "pip install dataretrieval[xarray]\n", + "```\n", + "\n", + "As with the rest of `waterdata`, signing up for a free\n", + "[API key](https://api.waterdata.usgs.gov/signup/) gives you higher rate limits.\n", + "Import the xarray wrappers under a short alias so they don't shadow the plain\n", + "`DataFrame`-returning getters:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "\n", + "from dataretrieval.waterdata import xarray as wdx" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. A single time series\n", + "\n", + "Pull a year of daily mean discharge (parameter `00060`, statistic `00003`) at one\n", + "gage. The wrapper takes the same arguments as `waterdata.get_daily`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ds = wdx.get_daily(\n", + " monitoring_location_id=\"USGS-05407000\",\n", + " parameter_code=\"00060\",\n", + " statistic_id=\"00003\",\n", + " time=\"2023-01-01/2024-01-01\",\n", + ")\n", + "ds" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note the shape of the result — this is a CF **contiguous ragged array**\n", + "(`featureType = \"timeSeries\"`):\n", + "\n", + "* every observation lives along a single `obs` dimension;\n", + "* each *series* — one `(monitoring location, parameter, statistic)` — is one\n", + " instance along the `timeseries` dimension;\n", + "* `row_size` records how many observations each series contributes (the CF\n", + " `sample_dimension` link), and `timeseries_id` carries `cf_role`;\n", + "* because there is a single, homogeneous parameter here, the descriptors land\n", + " directly on `value` (`long_name`, `units`, `cell_methods`, `standard_name`).\n", + "\n", + "The flag columns (`qualifier`, `approval_status`) are linked as\n", + "`ancillary_variables`, and dataset attributes carry provenance:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(ds[\"value\"].attrs)\n", + "print({k: ds.attrs[k] for k in (\"Conventions\", \"featureType\", \"references\", \"date_modified\")})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Why ragged is the default\n", + "\n", + "Real collections are *ragged*: some gages have a century of record, others a few\n", + "years. 
Pull discharge at two gages with very different start dates and look at\n", + "`row_size`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sites = [\"USGS-05407000\", \"USGS-02238500\"] # since 1913 vs since 2008\n", + "ragged = wdx.get_daily(\n", + " monitoring_location_id=sites, parameter_code=\"00060\", statistic_id=\"00003\"\n", + ")\n", + "print(\"dims :\", dict(ragged.sizes))\n", + "print(\"row_size :\", dict(zip(ragged[\"monitoring_location_id\"].values, ragged[\"row_size\"].values)))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The alternative is a **dense** `(monitoring_location_id, time)` grid — one named\n", + "variable per parameter, NaN where a series has no observation. It is convenient\n", + "(see the next section) but pays for a union time axis and NaN fill. Pass\n", + "`dense=True` to get it, and compare the in-memory size:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dense = wdx.get_daily(\n", + " monitoring_location_id=sites, parameter_code=\"00060\", statistic_id=\"00003\",\n", + " dense=True,\n", + ")\n", + "print(\"dense dims :\", dict(dense.sizes))\n", + "print(\"dense var :\", list(dense.data_vars))\n", + "print(f\"ragged {ragged.nbytes/1e6:6.2f} MB dense {dense.nbytes/1e6:6.2f} MB\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "With only two gages the gap is modest, but it grows fast: a whole state's daily\n", + "discharge can be 15–30% dense, where the gridded layout balloons to hundreds of\n", + "megabytes (mostly NaN) while the ragged array stores only real observations.\n", + "That is why ragged is the default." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. 
The trade-off: selecting by time\n", + "\n", + "This is the one thing to internalize about the ragged layout.\n", + "\n", + "In the **dense** dataset, `time` is a real dimension with an index, so\n", + "label-based selection just works:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# all sites on a given day, addressed as a (site, time) grid\n", + "dense[\"discharge\"].sel(time=\"2020-06-01\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the **ragged** dataset, `value` is one flat list along `obs`, and `time` is\n", + "*just another variable* riding along `obs` — not a dimension. So `value` cannot\n", + "be addressed as a `(site, time)` grid, and a time selection returns whatever\n", + "observations happen to match, **mixed across sites with no labels** (and\n", + "identical dates across sites collide onto the same axis):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\"value dims:\", ragged[\"value\"].dims) # ('obs',) -- not (site, time)\n", + "# .sel(time=...) on the flat obs axis can't give you a per-series slice\n", + "ragged[\"value\"].sel(time=\"2020-06-01\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To recover the convenient, time-indexed view you first **regroup** the flat\n", + "`obs` back into per-series pieces using the offsets implied by `row_size`. 
A\n", + "tiny helper:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def series(ds, i):\n", + " \"\"\"Instance ``i`` of a ragged dataset as a time-indexed DataArray.\"\"\"\n", + " starts = np.concatenate([[0], np.cumsum(ds[\"row_size\"].values)])\n", + " sl = slice(int(starts[i]), int(starts[i + 1]))\n", + " da = ds[\"value\"].isel(obs=sl)\n", + " return da.assign_coords(time=ds[\"time\"].isel(obs=sl)).swap_dims(obs=\"time\")\n", + "\n", + "\n", + "s0 = series(ragged, 0)\n", + "print(ragged[\"timeseries_id\"].values[0])\n", + "s0.sel(time=\"2020-06-01\") # now .sel(time=...) works on a single series" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For anything beyond a one-off slice, the [`cf-xarray`](https://cf-xarray.readthedocs.io/)\n", + "package understands the CF ragged encoding (`cf_role`, `sample_dimension`) and\n", + "will regroup/decode for you. Or, when you know the pull is small and overlapping,\n", + "just ask for `dense=True` up front and use `.sel(time=...)` directly.\n", + "\n", + "**Rule of thumb:** ragged for storage and large multi-site pulls; `dense=True`\n", + "for ergonomic time-based slicing of a few overlapping series." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. Multiple parameters in one pull\n", + "\n", + "When a pull mixes parameters (and units), the ragged layout keeps a single\n", + "`value` and records the parameter/unit per *instance* — so nothing is\n", + "mislabeled. 
The homogeneous descriptors drop off `value` and live on the\n", + "`parameter_code` / `unit_of_measure` coordinates instead:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "multi = wdx.get_daily(\n", + " monitoring_location_id=\"USGS-05407000\",\n", + " parameter_code=[\"00060\", \"00045\"], # discharge + precipitation\n", + " time=\"2023-06-01/2023-07-01\",\n", + ")\n", + "print(\"value long_name:\", multi[\"value\"].attrs.get(\"long_name\"))\n", + "print(\"per-instance parameter_code:\", multi[\"parameter_code\"].values)\n", + "print(\"per-instance unit_of_measure:\", multi[\"unit_of_measure\"].values)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the `dense=True` form each parameter instead becomes its own named variable\n", + "(`discharge`, `precipitation`, …) on the shared time grid." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. Water-quality samples\n", + "\n", + "`get_samples` returns discrete water-quality results in the same ragged shape:\n", + "one instance per `(monitoring location, characteristic, sample fraction)`, with\n", + "the result value plus `detection_condition` and `status` as ancillary\n", + "(censoring) variables. Characteristics are free text, so no CF `standard_name`\n", + "is inferred and non-numeric results coerce to NaN (the `detection_condition`\n", + "variable preserves non-detects)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "wq = wdx.get_samples(\n", + " service=\"results\",\n", + " profile=\"basicphyschem\",\n", + " monitoringLocationIdentifier=\"USGS-05406500\",\n", + " activityStartDateLower=\"2019-01-01\",\n", + " activityStartDateUpper=\"2020-01-01\",\n", + ")\n", + "print(\"dims:\", dict(wq.sizes))\n", + "print(\"characteristics:\", sorted(set(wq[\"characteristic\"].values))[:8], \"...\")\n", + "print(\"ancillary:\", wq[\"value\"].attrs.get(\"ancillary_variables\"))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6. Writing CF netCDF\n", + "\n", + "Because the dataset already carries CF metadata in the standard ragged-array\n", + "encoding, it serializes straight to a self-describing netCDF file that\n", + "CF-aware tools (THREDDS, `cf-xarray`, Panoply, …) can read back (requires a\n", + "netCDF backend, e.g. `pip install netCDF4`):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ragged.to_netcdf(\"daily_discharge.nc\") # CF-1.11 contiguous ragged array" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## References\n", + "\n", + "* CF conventions — [Discrete Sampling Geometries](https://cfconventions.org/cf-conventions/cf-conventions.html#discrete-sampling-geometries)\n", + " (contiguous ragged array representation)\n", + "* [`cf-xarray`](https://cf-xarray.readthedocs.io/) — decode/group CF DSG datasets\n", + "* [`xarray`](https://docs.xarray.dev/) documentation" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/source/examples/index.rst b/docs/source/examples/index.rst index 6011fc4b..369aff01 100644 --- a/docs/source/examples/index.rst +++ 
b/docs/source/examples/index.rst @@ -15,6 +15,18 @@ covers a basic introduction to module functions and usage. WaterData_demo +CF-conventions ``xarray`` datasets from the ``waterdata`` module +---------------------------------------------------------------- +The ``waterdata.xarray`` wrappers return CF-conventions ``xarray.Dataset`` +objects (a ragged array by default, with a ``dense=True`` gridded opt-out). +This notebook demonstrates the layouts, the time-selection trade-off, and +writing CF netCDF. + +.. toctree:: + :maxdepth: 1 + + waterdata_xarray_demo + Simple uses of the ``dataretrieval`` package -------------------------------------------- diff --git a/docs/source/examples/waterdata_xarray_demo.nblink b/docs/source/examples/waterdata_xarray_demo.nblink new file mode 100644 index 00000000..9c886545 --- /dev/null +++ b/docs/source/examples/waterdata_xarray_demo.nblink @@ -0,0 +1,3 @@ +{ + "path": "../../../demos/waterdata_xarray_demo.ipynb" +} From 3274d91336ed69890b4188319b18e58e477cea5e Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Wed, 27 May 2026 10:30:23 -0500 Subject: [PATCH 19/24] fix(waterdata): repair queryables fallback, scalar properties, sediment unit Review fixes for the hash-drop path: * get_ogc_data's queryables-fallback except caught requests.HTTPError / requests.RequestException, but the module imports only httpx (no requests import), so any queryables failure raised NameError instead of falling back. Catch (httpx.HTTPError, RuntimeError, ValueError) -- the types the path actually raises (RateLimited/ServiceUnavailable subclass RuntimeError). * Guard _service_queryables against a non-dict 200 body so a malformed response falls back (caught ValueError) rather than escaping as AttributeError. * Normalize a scalar-string `properties` to a list at the get_ogc_data boundary, so get_reference_table's raw query dict can't trip `all(pd.isna("..."))` (TypeError) or a per-character set in _arrange_cols. 
* CF_UNIT_MAP: "tons/day" -> "ton day-1" ("short_ton" is not a valid UDUNITS name; UDUNITS "ton" is the US short ton). Add regression tests for the queryables fallback and scalar-properties paths. Co-Authored-By: Claude Opus 4.7 (1M context) --- dataretrieval/waterdata/types.py | 3 ++- dataretrieval/waterdata/utils.py | 22 ++++++++++----- tests/waterdata_utils_test.py | 46 ++++++++++++++++++++++++++++++++ 3 files changed, 64 insertions(+), 7 deletions(-) diff --git a/dataretrieval/waterdata/types.py b/dataretrieval/waterdata/types.py index eb4d0a27..4e5198f7 100644 --- a/dataretrieval/waterdata/types.py +++ b/dataretrieval/waterdata/types.py @@ -95,7 +95,8 @@ "uS/cm": "uS/cm", "mg/l": "mg L-1", "mg/L": "mg L-1", - "tons/day": "short_ton day-1", + # UDUNITS 'ton' is the US short ton; 'short_ton' is not a valid UDUNITS name. + "tons/day": "ton day-1", "%": "percent", } diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py index 4dd7b681..e1234420 100644 --- a/dataretrieval/waterdata/utils.py +++ b/dataretrieval/waterdata/utils.py @@ -245,15 +245,19 @@ def _service_queryables(service: str) -> list[str]: """Return the cached queryables property list for ``service``. One HTTP GET per service per process; the list is reused for every - subsequent call. Raises ``requests.HTTPError`` on a non-200 — the - caller's ``include_hash=False`` request can't be satisfied - without it, so failing loudly is preferable to silently dropping - the server-side trim. + subsequent call. Raises on a non-200 (a ``RuntimeError`` subclass via + ``_raise_for_non_200``, or an ``httpx.HTTPError`` on a transport failure); + ``get_ogc_data`` catches those and falls back to the client-side drop. 
""" cached = _queryables_cache.get(service) if cached is not None: return cached body = _check_ogc_requests(endpoint=service, req_type="queryables") + if not isinstance(body, dict): + # A 200 with a non-object body would make ``body.get`` raise + # AttributeError, which get_ogc_data's fallback doesn't catch; raise a + # caught type so a malformed response falls back to the client-side drop. + raise ValueError(f"queryables response for {service!r} was not a JSON object") props = list(body.get("properties", {}).keys()) _queryables_cache[service] = props return props @@ -1427,8 +1431,14 @@ def get_ogc_data( args["service"] = service args = _switch_arg_id(args, id_name=output_id, service=service) # Capture `properties` before the id-switch so post-processing sees - # the user-facing names, not the wire-format ones. + # the user-facing names, not the wire-format ones. A scalar string + # (e.g. from ``get_reference_table``'s raw ``query`` dict, which skips + # the list-normalization the typed getters do) is wrapped to a list so + # the downstream predicates / set-builds treat it as one column rather + # than per-character. 
properties = args.get("properties") + if isinstance(properties, str): + properties = [properties] convert_type = args.pop("convert_type", False) include_hash = args.pop("include_hash", False) @@ -1441,7 +1451,7 @@ def get_ogc_data( if use_server_trim: try: args["properties"] = _default_non_hash_properties(service, output_id) - except (requests.HTTPError, requests.RequestException, ValueError) as exc: + except (httpx.HTTPError, RuntimeError, ValueError) as exc: logger.warning( "Could not fetch queryables for %s (%s); " "falling back to client-side hash-ID drop.", diff --git a/tests/waterdata_utils_test.py b/tests/waterdata_utils_test.py index 2cbf4852..72d71ef0 100644 --- a/tests/waterdata_utils_test.py +++ b/tests/waterdata_utils_test.py @@ -927,3 +927,49 @@ def test_check_ogc_requests_raises_typed_on_5xx(httpx_mock): ) with pytest.raises(ServiceUnavailable): _check_ogc_requests(endpoint="daily", req_type="schema") + + +def _reach_fetch_sentinel(monkeypatch): + """Stub ``_fetch_once`` to raise a recognizable sentinel, so a test can + prove ``get_ogc_data`` got past the properties/queryables handling (rather + than crashing earlier) without doing any network I/O.""" + + def _boom(_args): + raise RuntimeError("reached _fetch_once") + + monkeypatch.setattr(_utils_module, "_fetch_once", _boom) + + +def test_get_ogc_data_falls_back_when_queryables_unavailable(monkeypatch): + # A queryables fetch failure (ServiceUnavailable / httpx error) must be + # caught and fall through to the client-side drop -- not crash. Regression: + # the except clause referenced an unimported ``requests``, so any failure + # raised NameError instead of falling back. 
+ def _boom_queryables(*args, **kwargs): + raise ServiceUnavailable("503") + + monkeypatch.setattr(_utils_module, "_default_non_hash_properties", _boom_queryables) + _reach_fetch_sentinel(monkeypatch) + # Reaching the sentinel proves the queryables error was caught and the call + # continued; a NameError or the bare ServiceUnavailable("503") would not + # match and would fail the test. + with pytest.raises(RuntimeError, match="reached _fetch_once"): + _utils_module.get_ogc_data( + args={"monitoring_location_id": "USGS-1"}, + output_id="daily_id", + service="daily", + ) + + +def test_get_ogc_data_accepts_scalar_properties(monkeypatch): + # A scalar (non-list) ``properties`` -- reachable via get_reference_table's + # raw query dict -- must be treated as a single column, not crash + # ``_properties_unspecified`` (all(pd.isna("time")) -> TypeError) or become + # a per-character set in ``_arrange_cols``. + _reach_fetch_sentinel(monkeypatch) + with pytest.raises(RuntimeError, match="reached _fetch_once"): + _utils_module.get_ogc_data( + args={"properties": "time", "monitoring_location_id": "USGS-1"}, + output_id="daily_id", + service="daily", + ) From ba749797cc0a6622b046ec7bfd6ed46e94f4a8fb Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Wed, 27 May 2026 13:01:28 -0500 Subject: [PATCH 20/24] docs(waterdata.xarray): emit valid numpydoc from the wrapper docstrings _xr_doc prepended a 4-space-indented note to the wrapped getter's docstring; because the wrapped first line is unindented, inspect.getdoc's dedent broke and left the Parameters/Returns sections over-indented, so napoleon (the package uses numpydoc) mis-rendered them. Build a column-0 summary paragraph and append the cleandoc'd body so the combined docstring is valid numpydoc. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- dataretrieval/waterdata/xarray.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/dataretrieval/waterdata/xarray.py b/dataretrieval/waterdata/xarray.py index 914308a3..d442d53a 100644 --- a/dataretrieval/waterdata/xarray.py +++ b/dataretrieval/waterdata/xarray.py @@ -40,6 +40,7 @@ from __future__ import annotations import datetime as _dt +import inspect as _inspect import re as _re import warnings as _warnings from collections.abc import Callable @@ -761,13 +762,18 @@ def _xr_doc(func, *, cf_metadata=True, allow_dense=True): "(always ragged; discrete samples are too sparse for a dense grid, " "so ``dense=`` does not apply)" ) + # A column-0 summary paragraph + a cleandoc'd body keeps the combined + # docstring valid numpydoc (the wrapped getter's first line is unindented + # but its body is source-indented, which would otherwise break dedent and + # over-indent the Parameters/Returns sections). note = ( - " xarray wrapper: same arguments as " - f"``dataretrieval.waterdata.{func.__name__}``, but returns\n" - f" {returns}. Hash-valued ID columns are always omitted here;\n" - " the ``include_hash`` flag does not apply.\n\n" + "xarray wrapper: same arguments as " + f"``dataretrieval.waterdata.{func.__name__}``, but returns {returns}. " + "Hash-valued ID columns are always omitted here; the ``include_hash`` " + "flag does not apply." ) - return note + (func.__doc__ or "") + body = _inspect.cleandoc(func.__doc__) if func.__doc__ else "" + return f"{note}\n\n{body}" if body else note @dataclass(frozen=True) From f1755e79cdb09f93968604c183451bad7e91d1e7 Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Wed, 27 May 2026 14:49:02 -0500 Subject: [PATCH 21/24] refactor(waterdata): drop the pandas-path hash-dropping; keep it in xarray Scope this PR to the xarray module. 
The earlier work added an include_hash flag and a hash-valued-ID drop (plus a server-side queryables whitelist) to the plain DataFrame getters; revert that public-API surface so api.py / utils.py and their tests match main, and the getters again return every column. The xarray datasets stay hash-free on their own: the timeseries/samples builders surface only the columns they convert (so per-record UUIDs and per-series join keys never appear), and _build_stats now drops the stats service's computation_id / parent_time_series_id explicitly since its flat conversion keeps every column. _fetch still swallows a stray include_hash kwarg so passing it to an xarray wrapper stays harmless. Adds tests pinning the hash-free guarantee for the stats and ragged paths. Co-Authored-By: Claude Opus 4.7 (1M context) --- dataretrieval/waterdata/api.py | 70 +-------- dataretrieval/waterdata/utils.py | 241 +++--------------------------- dataretrieval/waterdata/xarray.py | 17 ++- tests/waterdata_test.py | 143 +++--------------- tests/waterdata_utils_test.py | 201 ------------------------- tests/waterdata_xarray_test.py | 30 ++++ 6 files changed, 90 insertions(+), 612 deletions(-) diff --git a/dataretrieval/waterdata/api.py b/dataretrieval/waterdata/api.py index d9521e24..1ec9ed42 100644 --- a/dataretrieval/waterdata/api.py +++ b/dataretrieval/waterdata/api.py @@ -33,7 +33,6 @@ SAMPLES_URL, _check_profiles, _default_headers, - _drop_hash_columns, _get_args, get_ogc_data, get_stats_data, @@ -62,7 +61,6 @@ def get_daily( filter: str | None = None, filter_lang: FILTER_LANG | None = None, convert_type: bool = True, - include_hash: bool = False, ) -> tuple[pd.DataFrame, BaseMetadata]: """Daily data provide one data value to represent water conditions for the day. @@ -195,9 +193,6 @@ def get_daily( and the lexicographic-comparison pitfall. convert_type : boolean, optional If True, converts columns to appropriate types. 
- include_hash : boolean, optional - If False (default), drop the opaque hash-valued ID columns. Set True to - keep the secondary hashes (e.g. ``time_series_id``) that join to metadata. Returns ------- @@ -281,7 +276,6 @@ def get_continuous( filter: str | None = None, filter_lang: FILTER_LANG | None = None, convert_type: bool = True, - include_hash: bool = False, ) -> tuple[pd.DataFrame, BaseMetadata]: """ Continuous data provide instantaneous water conditions. @@ -409,9 +403,6 @@ def get_continuous( convert_type : boolean, optional If True, the function will convert the data to dates and qualifier to string vector - include_hash : boolean, optional - If False (default), drop the opaque hash-valued ID columns. Set True to - keep the secondary hashes (e.g. ``time_series_id``) that join to metadata. Returns ------- @@ -505,7 +496,6 @@ def get_monitoring_locations( filter: str | None = None, filter_lang: FILTER_LANG | None = None, convert_type: bool = True, - include_hash: bool = False, ) -> tuple[pd.DataFrame, BaseMetadata]: """Location information is basic information about the monitoring location including the name, identifier, agency responsible for data collection, and @@ -721,9 +711,6 @@ def get_monitoring_locations( and the lexicographic-comparison pitfall. convert_type : boolean, optional If True, converts columns to appropriate types. - include_hash : boolean, optional - If False (default), drop the opaque hash-valued ID columns. Set True to - keep the secondary hashes (e.g. ``time_series_id``) that join to metadata. 
Returns ------- @@ -787,7 +774,6 @@ def get_time_series_metadata( filter: str | None = None, filter_lang: FILTER_LANG | None = None, convert_type: bool = True, - include_hash: bool = False, ) -> tuple[pd.DataFrame, BaseMetadata]: """Daily data and continuous measurements are grouped into time series, which represent a collection of observations of a single parameter, @@ -948,9 +934,6 @@ def get_time_series_metadata( and the lexicographic-comparison pitfall. convert_type : boolean, optional If True, converts columns to appropriate types. - include_hash : boolean, optional - If False (default), drop the opaque hash-valued ID columns. Set True to - keep the secondary hashes (e.g. ``time_series_id``) that join to metadata. Returns ------- @@ -1048,7 +1031,6 @@ def get_combined_metadata( filter: str | None = None, filter_lang: FILTER_LANG | None = None, convert_type: bool = True, - include_hash: bool = False, ) -> tuple[pd.DataFrame, BaseMetadata]: """Get combined monitoring-location and time-series metadata. @@ -1149,9 +1131,6 @@ def get_combined_metadata( and the lexicographic-comparison pitfall. convert_type : boolean, optional If True, converts columns to appropriate types. - include_hash : boolean, optional - If False (default), drop the opaque hash-valued ID columns. Set True to - keep the secondary hashes (e.g. ``time_series_id``) that join to metadata. Returns ------- @@ -1240,7 +1219,6 @@ def get_latest_continuous( filter: str | None = None, filter_lang: FILTER_LANG | None = None, convert_type: bool = True, - include_hash: bool = False, ) -> tuple[pd.DataFrame, BaseMetadata]: """This endpoint provides the most recent observation for each time series of continuous data. Continuous data are collected via automated sensors @@ -1370,9 +1348,6 @@ def get_latest_continuous( and the lexicographic-comparison pitfall. convert_type : boolean, optional If True, converts columns to appropriate types. 
- include_hash : boolean, optional - If False (default), drop the opaque hash-valued ID columns. Set True to - keep the secondary hashes (e.g. ``time_series_id``) that join to metadata. Returns ------- @@ -1439,7 +1414,6 @@ def get_latest_daily( filter: str | None = None, filter_lang: FILTER_LANG | None = None, convert_type: bool = True, - include_hash: bool = False, ) -> tuple[pd.DataFrame, BaseMetadata]: """Daily data provide one data value to represent water conditions for the day. @@ -1571,9 +1545,6 @@ def get_latest_daily( and the lexicographic-comparison pitfall. convert_type : boolean, optional If True, converts columns to appropriate types. - include_hash : boolean, optional - If False (default), drop the opaque hash-valued ID columns. Set True to - keep the secondary hashes (e.g. ``time_series_id``) that join to metadata. Returns ------- @@ -1641,7 +1612,6 @@ def get_field_measurements( filter: str | None = None, filter_lang: FILTER_LANG | None = None, convert_type: bool = True, - include_hash: bool = False, ) -> tuple[pd.DataFrame, BaseMetadata]: """Field measurements are physically measured values collected during a visit to the monitoring location. Field measurements consist of measurements @@ -1763,9 +1733,6 @@ def get_field_measurements( and the lexicographic-comparison pitfall. convert_type : boolean, optional If True, converts columns to appropriate types. - include_hash : boolean, optional - If False (default), drop the opaque hash-valued ID columns. Set True to - keep the secondary hashes (e.g. ``time_series_id``) that join to metadata. Returns ------- @@ -1829,7 +1796,6 @@ def get_field_measurements_metadata( filter: str | None = None, filter_lang: FILTER_LANG | None = None, convert_type: bool = True, - include_hash: bool = False, ) -> tuple[pd.DataFrame, BaseMetadata]: """Get field-measurement metadata: one row per (location, parameter) series. @@ -1885,9 +1851,6 @@ def get_field_measurements_metadata( and the lexicographic-comparison pitfall. 
convert_type : boolean, optional If True, converts columns to appropriate types. - include_hash : boolean, optional - If False (default), drop the opaque hash-valued ID columns. Set True to - keep the secondary hashes (e.g. ``time_series_id``) that join to metadata. Returns ------- @@ -1954,7 +1917,6 @@ def get_peaks( filter: str | None = None, filter_lang: FILTER_LANG | None = None, convert_type: bool = True, - include_hash: bool = False, ) -> tuple[pd.DataFrame, BaseMetadata]: """Get the annual peak streamflow / stage record for a monitoring location. @@ -2013,9 +1975,6 @@ def get_peaks( and the lexicographic-comparison pitfall. convert_type : boolean, optional If True, converts columns to appropriate types. - include_hash : boolean, optional - If False (default), drop the opaque hash-valued ID columns. Set True to - keep the secondary hashes (e.g. ``time_series_id``) that join to metadata. Returns ------- @@ -2193,7 +2152,6 @@ def get_samples( pointLocationWithinMiles: float | None = None, projectIdentifier: str | Iterable[str] | None = None, recordIdentifierUserSupplied: str | Iterable[str] | None = None, - include_hash: bool = False, ) -> tuple[pd.DataFrame, BaseMetadata]: """Search Samples database for USGS water quality data. This is a wrapper function for the Samples database API. All potential @@ -2324,9 +2282,6 @@ def get_samples( recordIdentifierUserSupplied : string or iterable of strings, optional Internal AQS record identifier that returns 1 entry. Only available for the "results" service. - include_hash : boolean, optional - If False (default), drop the opaque per-activity / per-result UUID columns - (``Activity_ActivityIdentifier``, ``Result_MeasureIdentifier``). 
Returns ------- @@ -2376,7 +2331,7 @@ def get_samples( _check_profiles(service, profile) # Build argument dictionary, omitting None values - params = _get_args(locals(), exclude={"ssl_check", "profile", "include_hash"}) + params = _get_args(locals(), exclude={"ssl_check", "profile"}) params.update({"mimeType": "text/csv"}) @@ -2399,7 +2354,6 @@ def get_samples( df = pd.read_csv(StringIO(response.text), delimiter=",") df = _attach_datetime_columns(df) - df = _drop_hash_columns(df, include_hash) return df, BaseMetadata(response) @@ -2492,7 +2446,6 @@ def get_stats_por( site_type_name: str | Iterable[str] | None = None, parameter_code: str | Iterable[str] | None = None, expand_percentiles: bool = True, - include_hash: bool = False, ) -> tuple[pd.DataFrame, BaseMetadata]: """Get day-of-year and month-of-year water data statistics from the USGS Water Data API. @@ -2571,9 +2524,6 @@ def get_stats_por( argument will return both the "values" column, containing the list of percentile threshold values, and a "value" column, containing the singular summary value for the other statistics. - include_hash : boolean, optional - If False (default), drop the hash columns (``computation_id``, - ``parent_time_series_id``); set True to keep them for joining to metadata. Examples -------- @@ -2598,13 +2548,10 @@ def get_stats_por( ... 
) """ # Build argument dictionary, omitting None values - params = _get_args(locals(), exclude={"expand_percentiles", "include_hash"}) + params = _get_args(locals(), exclude={"expand_percentiles"}) return get_stats_data( - args=params, - service="observationNormals", - expand_percentiles=expand_percentiles, - include_hash=include_hash, + args=params, service="observationNormals", expand_percentiles=expand_percentiles ) @@ -2623,7 +2570,6 @@ def get_stats_date_range( site_type_name: str | Iterable[str] | None = None, parameter_code: str | Iterable[str] | None = None, expand_percentiles: bool = True, - include_hash: bool = False, ) -> tuple[pd.DataFrame, BaseMetadata]: """Get monthly and annual water data statistics from the USGS Water Data API. This service (called the "observationIntervals" endpoint on api.waterdata.usgs.gov) @@ -2706,9 +2652,6 @@ def get_stats_date_range( argument will return both the "values" column, containing the list of percentile threshold values, and a "value" column, containing the singular summary value for the other statistics. - include_hash : boolean, optional - If False (default), drop the hash columns (``computation_id``, - ``parent_time_series_id``); set True to keep them for joining to metadata. Examples -------- @@ -2734,13 +2677,12 @@ def get_stats_date_range( ... ) """ # Build argument dictionary, omitting None values - params = _get_args(locals(), exclude={"expand_percentiles", "include_hash"}) + params = _get_args(locals(), exclude={"expand_percentiles"}) return get_stats_data( args=params, service="observationIntervals", expand_percentiles=expand_percentiles, - include_hash=include_hash, ) @@ -2776,7 +2718,6 @@ def get_channel( filter: str | None = None, filter_lang: FILTER_LANG | None = None, convert_type: bool = True, - include_hash: bool = False, ) -> tuple[pd.DataFrame, BaseMetadata]: """ Channel measurements taken as part of streamflow field measurements. 
@@ -2891,9 +2832,6 @@ def get_channel( convert_type : boolean, optional If True, the function will convert the data to dates and qualifier to string vector - include_hash : boolean, optional - If False (default), drop the opaque hash-valued ID columns. Set True to - keep the secondary hashes (e.g. ``time_series_id``) that join to metadata. Returns ------- diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py index e1234420..66ed1723 100644 --- a/dataretrieval/waterdata/utils.py +++ b/dataretrieval/waterdata/utils.py @@ -181,144 +181,6 @@ def _switch_properties_id(properties: list[str] | None, id_name: str, service: s # parameters and require POST with CQL2 JSON instead. _CQL2_REQUIRED_SERVICES = frozenset({"monitoring-locations"}) -# Column names whose values are server-generated hashes (UUIDs or hex -# digests): opaque, non-human-meaningful, and a payload-bloat on large -# queries. Dropped by default; opt in with ``include_hash=True``. -# Includes two kinds with different stability: -# - The per-record version UUIDs that are aliased to a service's -# ``output_id`` (``daily_id``, ``continuous_id``, …). These are -# regenerated on every record refresh, so they are NOT stable over -# time and joining/diffing on them produces spurious churn. They get -# mapped to/from ``"id"`` on the wire; both names are listed so the -# filter works on either side of ``_switch_properties_id``. -# - Secondary hash columns embedded in record payloads -# (``time_series_id``, ``field_visit_id``, ``parent_time_series_id``, -# ``field_measurements_series_id``). These ARE stable and are the -# join keys back to the metadata endpoints (e.g. ``time_series_id`` -# links a values row to ``get_time_series_metadata``); they're -# dropped only because they're opaque and bloat the payload, so a -# caller who needs to join sets ``include_hash=True`` or names -# the column in ``properties``. -# ``monitoring_location_id`` (AGENCY-ID format, e.g. 
``USGS-01646500``) -# and other code columns (``parameter_code``, ``statistic_id``, …) are -# intentionally absent — they're stable, human-meaningful identifiers. -_HASH_ID_COLUMNS = frozenset( - { - "daily_id", - "continuous_id", - "latest_continuous_id", - "latest_daily_id", - "field_measurement_id", - "field_series_id", - "peak_id", - "channel_measurements_id", - "combined_meta_id", - "time_series_id", - "parent_time_series_id", - "field_visit_id", - "field_measurements_series_id", - # ``get_stats_*`` (statistics service) output — per-computation - # UUID; ``parent_time_series_id`` is already listed above. - "computation_id", - # ``get_samples`` (Samples database CSV) — per-activity and - # per-result UUIDs. The Samples service uses CamelCase column - # names rather than snake_case, but the drop logic only needs - # exact name matches so they share this set. - "Activity_ActivityIdentifier", - "Result_MeasureIdentifier", - } -) - -# Cache of per-service queryables column lists, populated on first call -# from each service when computing the default ``properties=`` for -# ``include_hash=False``. Keyed by service name; value is the full -# list of property names the server exposes for that collection. -_queryables_cache: dict[str, list[str]] = {} -# Cache of the derived non-hash property whitelist, keyed by -# ``(service, output_id)``. Both inputs determine the result, and -# both are stable per call site — re-deriving on every OGC request -# would do ~30–100 frozenset lookups per call for no reason. -_default_props_cache: dict[tuple[str, str], list[str]] = {} - - -def _service_queryables(service: str) -> list[str]: - """Return the cached queryables property list for ``service``. - - One HTTP GET per service per process; the list is reused for every - subsequent call. 
Raises on a non-200 (a ``RuntimeError`` subclass via - ``_raise_for_non_200``, or an ``httpx.HTTPError`` on a transport failure); - ``get_ogc_data`` catches those and falls back to the client-side drop. - """ - cached = _queryables_cache.get(service) - if cached is not None: - return cached - body = _check_ogc_requests(endpoint=service, req_type="queryables") - if not isinstance(body, dict): - # A 200 with a non-object body would make ``body.get`` raise - # AttributeError, which get_ogc_data's fallback doesn't catch; raise a - # caught type so a malformed response falls back to the client-side drop. - raise ValueError(f"queryables response for {service!r} was not a JSON object") - props = list(body.get("properties", {}).keys()) - _queryables_cache[service] = props - return props - - -def _default_non_hash_properties(service: str, output_id: str) -> list[str]: - """Build the ``properties=`` whitelist sent to the server when the - caller didn't supply one and ``include_hash=False``. - - The whitelist is the service's queryables minus :data:`_HASH_ID_COLUMNS`, - minus ``"geometry"`` (the OGC server returns geometry via the feature - envelope, not as a property — some collections reject it as a - property name), and minus the wire-format ``"id"`` column when the - service's ``output_id`` is itself a hash column (e.g. ``daily_id``). - For ``monitoring-locations``, ``id`` becomes the AGENCY-ID - ``monitoring_location_id``, so it's kept. - """ - key = (service, output_id) - cached = _default_props_cache.get(key) - if cached is not None: - return cached - drop_wire_id = output_id in _HASH_ID_COLUMNS - props = [ - p - for p in _service_queryables(service) - if p not in _HASH_ID_COLUMNS - and p != "geometry" - and not (drop_wire_id and p == "id") - ] - _default_props_cache[key] = props - return props - - -def _properties_unspecified(properties) -> bool: - """True when the caller didn't pin a ``properties`` list. 
- - A ``None``, empty list, or list-of-only-NaN counts as unspecified. - Centralizes the predicate so the (subtly different) ``not properties`` - vs ``properties is None`` variants across call sites stay aligned. - """ - return not properties or all(pd.isna(properties)) - - -def _drop_hash_columns( - df: pd.DataFrame, - include_hash: bool, - keep: set[str] | None = None, -) -> pd.DataFrame: - """Drop hash-valued ID columns from ``df`` when not opting in. - - When ``include_hash`` is True, returns ``df`` unchanged. Otherwise - drops every column whose name is in :data:`_HASH_ID_COLUMNS`, except - those the caller listed in ``keep`` (e.g. names appearing in an - explicit user ``properties=`` request — explicit beats default). - A no-op when no hash columns are present. - """ - if include_hash: - return df - drop = (set(df.columns) & _HASH_ID_COLUMNS) - (keep or set()) - return df.drop(columns=drop) if drop else df - def _parse_datetime(value: str) -> datetime | None: """Parse a single datetime string against the supported formats. @@ -1242,10 +1104,7 @@ def _deal_with_empty( def _arrange_cols( - df: pd.DataFrame, - properties: list[str] | None, - output_id: str, - include_hash: bool = False, + df: pd.DataFrame, properties: list[str] | None, output_id: str ) -> pd.DataFrame: """ Rearranges and renames columns in a DataFrame based on provided @@ -1260,13 +1119,6 @@ def _arrange_cols( only NaN, the function renames 'id' to output_id. output_id : str The name to which the 'id' column should be renamed if applicable. - include_hash : bool, optional - If False (default), hash-valued ID columns (see - :data:`_HASH_ID_COLUMNS`) are dropped from the result unless the - caller explicitly named them in ``properties``. If True, the - legacy behavior is preserved: hash columns are kept and the - per-record output_id columns are moved to the end of the - DataFrame when ``properties`` is unspecified. 
Returns ------- @@ -1278,9 +1130,7 @@ def _arrange_cols( # Rename id column to output_id df = df.rename(columns={"id": output_id}) - user_specified = not _properties_unspecified(properties) - - if user_specified: + if properties and not all(pd.isna(properties)): # Don't alias the caller's list — we mutate below. local_properties = list(properties) if "geometry" in df.columns and "geometry" not in local_properties: @@ -1291,32 +1141,22 @@ def _arrange_cols( local_properties[local_properties.index("id")] = output_id df = df.loc[:, [col for col in local_properties if col in df.columns]] - # Client-side safety net for the server-side trim done in - # ``get_ogc_data``: no-op on the happy path (server already omitted - # hash columns), drops them here when the queryables fetch failed - # and we fell back to a full payload. An explicit caller - # ``properties`` list — including ``"id"``, which resolved to - # ``output_id`` above — wins over the default. - keep: set[str] = set() - if user_specified: - keep = set(properties) - if "id" in keep: - keep.add(output_id) - df = _drop_hash_columns(df, include_hash, keep=keep) - - # Legacy ordering: when ``include_hash=True`` and ``properties`` - # is unspecified, move the per-record version IDs to the end so they - # don't crowd the front. With ``include_hash=False`` those - # columns are gone above, so this branch is a no-op. 
- extra_id_col = set(df.columns) & { - "latest_continuous_id", - "latest_daily_id", - "daily_id", - "continuous_id", - "field_measurement_id", - } + # Move meaningless-to-user, extra id columns to the end + # of the dataframe, if they exist + extra_id_col = set(df.columns).intersection( + { + "latest_continuous_id", + "latest_daily_id", + "daily_id", + "continuous_id", + "field_measurement_id", + } + ) - if extra_id_col and _properties_unspecified(properties): + # If the arbitrary id column is returned (either due to properties + # being none or NaN), then move it to the end of the dataframe, but + # if part of properties, keep in requested order + if extra_id_col and (properties is None or all(pd.isna(properties))): id_col_order = [col for col in df.columns if col not in extra_id_col] + list( extra_id_col ) @@ -1431,38 +1271,12 @@ def get_ogc_data( args["service"] = service args = _switch_arg_id(args, id_name=output_id, service=service) # Capture `properties` before the id-switch so post-processing sees - # the user-facing names, not the wire-format ones. A scalar string - # (e.g. from ``get_reference_table``'s raw ``query`` dict, which skips - # the list-normalization the typed getters do) is wrapped to a list so - # the downstream predicates / set-builds treat it as one column rather - # than per-character. + # the user-facing names, not the wire-format ones. properties = args.get("properties") - if isinstance(properties, str): - properties = [properties] + args["properties"] = _switch_properties_id( + properties, id_name=output_id, service=service + ) convert_type = args.pop("convert_type", False) - include_hash = args.pop("include_hash", False) - - # When the caller didn't pin ``properties`` and isn't opting into - # hash IDs, try a server-side whitelist of the non-hash columns so - # the server skips serializing UUID/hex fields. On any queryables - # failure, fall through to the full payload — ``_arrange_cols`` - # post-processes the drop as a safety net. 
- use_server_trim = not include_hash and _properties_unspecified(properties) - if use_server_trim: - try: - args["properties"] = _default_non_hash_properties(service, output_id) - except (httpx.HTTPError, RuntimeError, ValueError) as exc: - logger.warning( - "Could not fetch queryables for %s (%s); " - "falling back to client-side hash-ID drop.", - service, - exc, - ) - use_server_trim = False - if not use_server_trim: - args["properties"] = _switch_properties_id( - properties, id_name=output_id, service=service - ) args = {k: v for k, v in args.items() if v is not None} with _progress.progress_context(service=service): @@ -1470,9 +1284,7 @@ def get_ogc_data( return_list = _deal_with_empty(return_list, properties, service) if convert_type: return_list = _type_cols(return_list) - return_list = _arrange_cols( - return_list, properties, output_id, include_hash=include_hash - ) + return_list = _arrange_cols(return_list, properties, output_id) return_list = _sort_rows(return_list) return return_list, BaseMetadata(response) @@ -1658,7 +1470,6 @@ def get_stats_data( service: str, expand_percentiles: bool, client: httpx.Client | None = None, - include_hash: bool = False, ) -> tuple[pd.DataFrame, BaseMetadata]: """ Retrieves statistical data from a specified endpoint and returns it @@ -1680,9 +1491,6 @@ def get_stats_data( each percentile gets its own row in the returned dataframe. If True and user requests a computation_type other than percentiles, a percentile column is still returned. - include_hash : bool, optional - If False (default), drop the hash columns (``computation_id``, - ``parent_time_series_id``); set True to keep them for joining to metadata. Returns ------- @@ -1728,11 +1536,6 @@ def follow_up(cursor: str, client: httpx.Client) -> httpx.Response: if expand_percentiles: df = _expand_percentiles(df) - - # Drop hash IDs after ``_expand_percentiles`` — it merges on - # ``computation_id`` while exploding the percentile lists. 
- df = _drop_hash_columns(df, include_hash) - return df, BaseMetadata(response) diff --git a/dataretrieval/waterdata/xarray.py b/dataretrieval/waterdata/xarray.py index d442d53a..b695aaad 100644 --- a/dataretrieval/waterdata/xarray.py +++ b/dataretrieval/waterdata/xarray.py @@ -707,7 +707,12 @@ def _build_stats(df, base_meta, service): """ if df is None or len(df) == 0: return _empty_dataset(service, base_meta) - flat = df.drop(columns=[c for c in ("geometry",) if c in df.columns]) + # The timeseries/samples builders surface only the columns they convert, so + # opaque hash IDs never reach those datasets. This flat path keeps every + # column, so drop the stats service's hash-valued IDs (and geometry) here to + # keep the CF dataset free of per-record UUID coordinates. + drop = ("geometry", "computation_id", "parent_time_series_id") + flat = df.drop(columns=[c for c in drop if c in df.columns]) ds = _xr.Dataset.from_dataframe(flat.reset_index(drop=True)) ds.attrs = _dataset_attrs(service, base_meta) ds.attrs["comment"] = "preliminary flat conversion; see module docs" @@ -728,11 +733,13 @@ def _sites(df): def _fetch(func, args, kwargs): - """Call the underlying getter with hash IDs forced off. + """Call the underlying getter, dropping a stray ``include_hash`` kwarg. - The xarray path never surfaces hash columns (neither the per-record - UUID nor the per-series join key), so ``include_hash`` is dropped here: - passing it has no effect and we avoid fetching a column we'd discard. + The xarray builders surface only the columns they convert, so the opaque + hash-valued ID columns (per-record UUIDs, per-series join keys) never reach + the dataset regardless of what the getter returns. ``include_hash`` is not a + parameter of the plain getters, so it is swallowed here to keep passing it + to an xarray wrapper harmless. 
""" kwargs.pop("include_hash", None) return func(*args, **kwargs) diff --git a/tests/waterdata_test.py b/tests/waterdata_test.py index d5d9b5eb..09f66aa5 100644 --- a/tests/waterdata_test.py +++ b/tests/waterdata_test.py @@ -85,14 +85,8 @@ def test_mock_get_samples(httpx_mock): monitoringLocationIdentifier="USGS-05406500", ) assert type(df) is DataFrame - # 181 source columns + 6 derived DateTime columns − 2 hash IDs - # (Activity_ActivityIdentifier, Result_MeasureIdentifier) dropped by default. - assert df.shape == (67, 185) - assert "Activity_ActivityIdentifier" not in df.columns - assert "Result_MeasureIdentifier" not in df.columns - # Stable identifiers are preserved. - assert "Location_Identifier" in df.columns - assert "Org_Identifier" in df.columns + # 181 source columns + 6 derived DateTime columns + assert df.shape == (67, 187) assert md.url == request_url assert isinstance(md.query_time, datetime.timedelta) assert md.header.get("mock_header") == "value" @@ -100,29 +94,6 @@ def test_mock_get_samples(httpx_mock): assert df["Activity_StartDateTime"].notna().any() -def test_mock_get_samples_include_hash(httpx_mock): - """``include_hash=True`` restores the legacy column set.""" - request_url = ( - "https://api.waterdata.usgs.gov/samples-data/results/fullphyschem?" 
- "activityMediaName=Water&activityStartDateLower=2020-01-01" - "&activityStartDateUpper=2024-12-31&monitoringLocationIdentifier=USGS-05406500&mimeType=text%2Fcsv" - ) - response_file_path = "tests/data/samples_results.txt" - mock_request(httpx_mock, request_url, response_file_path) - df, _md = get_samples( - service="results", - profile="fullphyschem", - activityMediaName="Water", - activityStartDateLower="2020-01-01", - activityStartDateUpper="2024-12-31", - monitoringLocationIdentifier="USGS-05406500", - include_hash=True, - ) - assert df.shape == (67, 187) - assert "Activity_ActivityIdentifier" in df.columns - assert "Result_MeasureIdentifier" in df.columns - - def test_mock_get_samples_summary(httpx_mock): """Tests USGS Samples summary query""" request_url = ( @@ -248,11 +219,10 @@ def test_samples_results(): activityStartDateLower="2024-10-01", activityStartDateUpper="2025-04-24", ) - # Stable identifiers are kept; hash IDs (Activity_ActivityIdentifier, - # Result_MeasureIdentifier) are dropped by default. - assert "Location_Identifier" in df.columns - assert "Activity_ActivityIdentifier" not in df.columns - assert "Result_MeasureIdentifier" not in df.columns + assert all( + col in df.columns + for col in ["Location_Identifier", "Activity_ActivityIdentifier"] + ) assert len(df) > 0 @@ -264,10 +234,7 @@ def test_samples_activity(): monitoringLocationIdentifier="USGS-06719505", ) assert len(df) > 0 - # 97 → 96 cols after dropping Activity_ActivityIdentifier - # (Result_MeasureIdentifier is not in the ``activities`` profile). - assert len(df.columns) == 96 - assert "Activity_ActivityIdentifier" not in df.columns + assert len(df.columns) == 97 assert "Location_HUCTwelveDigitCode" in df.columns @@ -313,14 +280,10 @@ def test_get_daily(): parameter_code="00060", time="2025-01-01/..", ) - # Default: hash-valued ID columns (daily_id, time_series_id) are - # dropped. Stable identifiers (monitoring_location_id, - # parameter_code, statistic_id, time) are preserved. 
- assert "daily_id" not in df.columns - assert "time_series_id" not in df.columns - assert "monitoring_location_id" in df.columns + assert "daily_id" in df.columns assert "geometry" in df.columns - assert df.shape[1] == 10 + assert df.columns[-1] == "daily_id" + assert df.shape[1] == 12 assert df.parameter_code.unique().tolist() == ["00060"] assert df.monitoring_location_id.unique().tolist() == ["USGS-05427718"] assert df["time"].apply(lambda x: isinstance(x, datetime.date)).all() @@ -330,22 +293,6 @@ def test_get_daily(): assert df["value"].dtype == "float64" -def test_get_daily_include_hash(): - """``include_hash=True`` restores the legacy behavior: the - per-record UUID (``daily_id``) and secondary hashes - (``time_series_id``) are present.""" - df, _ = get_daily( - monitoring_location_id="USGS-05427718", - parameter_code="00060", - time="2025-01-01/..", - include_hash=True, - ) - assert "daily_id" in df.columns - assert "time_series_id" in df.columns - assert df.columns[-1] == "daily_id" - assert df.shape[1] == 12 - - def test_get_daily_properties(): df, _ = get_daily( monitoring_location_id="USGS-05427718", @@ -391,8 +338,7 @@ def test_get_daily_no_geometry(): skip_geometry=True, ) assert "geometry" not in df.columns - # 10 default cols minus geometry, with hash IDs dropped by default. - assert df.shape[1] == 9 + assert df.shape[1] == 11 assert isinstance(df, DataFrame) @@ -408,11 +354,7 @@ def test_get_continuous(): df["time"].dtype.name.startswith("datetime64[") and "UTC" in df["time"].dtype.name ) - # Default: continuous_id (UUID) and time_series_id (hex hash) are - # dropped. Set ``include_hash=True`` to keep them. 
- assert "continuous_id" not in df.columns - assert "time_series_id" not in df.columns - assert "monitoring_location_id" in df.columns + assert "continuous_id" in df.columns def test_get_monitoring_locations(): @@ -437,10 +379,7 @@ def test_get_latest_continuous(): monitoring_location_id=["USGS-05427718", "USGS-05427719"], parameter_code=["00060", "00065"], ) - # Default: latest_continuous_id (UUID) and time_series_id are dropped. - assert "latest_continuous_id" not in df.columns - assert "time_series_id" not in df.columns - assert "monitoring_location_id" in df.columns + assert df.columns[-1] == "latest_continuous_id" assert df.shape[0] <= 4 assert df.statistic_id.unique().tolist() == ["00011"] assert hasattr(md, "url") @@ -455,11 +394,8 @@ def test_get_latest_daily(): monitoring_location_id=["USGS-05427718", "USGS-05427719"], parameter_code=["00060", "00065"], ) - # Default: latest_daily_id (UUID) and time_series_id are dropped. - assert "latest_daily_id" not in df.columns - assert "time_series_id" not in df.columns - assert "monitoring_location_id" in df.columns - assert df.shape[1] == 10 + assert "latest_daily_id" in df.columns + assert df.shape[1] == 12 assert hasattr(md, "url") assert hasattr(md, "query_time") @@ -487,12 +423,7 @@ def test_get_field_measurements(): time="2025-01-01/2025-10-01", skip_geometry=True, ) - # Default: field_measurement_id (UUID), field_measurements_series_id - # (UUID), and field_visit_id (UUID) are dropped. 
- assert "field_measurement_id" not in df.columns - assert "field_measurements_series_id" not in df.columns - assert "field_visit_id" not in df.columns - assert "monitoring_location_id" in df.columns + assert "field_measurement_id" in df.columns assert "geometry" not in df.columns assert df.unit_of_measure.unique().tolist() == ["ft^3/s"] assert hasattr(md, "url") @@ -550,9 +481,7 @@ def test_get_field_measurements_metadata(): df, md = get_field_measurements_metadata( monitoring_location_id="USGS-02238500", skip_geometry=True ) - # Default: field_series_id (UUID) is dropped. - assert "field_series_id" not in df.columns - assert "monitoring_location_id" in df.columns + assert "field_series_id" in df.columns assert "begin" in df.columns assert "end" in df.columns assert (df["monitoring_location_id"] == "USGS-02238500").all() @@ -580,10 +509,7 @@ def test_get_field_measurements_metadata_multi_site(): def test_get_peaks(): df, md = get_peaks(monitoring_location_id="USGS-02238500", skip_geometry=True) - # Default: peak_id (UUID) and time_series_id are dropped. - assert "peak_id" not in df.columns - assert "time_series_id" not in df.columns - assert "monitoring_location_id" in df.columns + assert "peak_id" in df.columns assert "value" in df.columns assert "water_year" in df.columns assert (df["monitoring_location_id"] == "USGS-02238500").all() @@ -654,31 +580,13 @@ def test_get_stats_por_expanded_false(): computation_type=["minimum", "percentile"], ) assert df.shape[0] == 4 - # Default: hash IDs (computation_id, parent_time_series_id) dropped → 18 cols. 
- assert df.shape[1] == 18 - assert "computation_id" not in df.columns - assert "parent_time_series_id" not in df.columns + assert df.shape[1] == 20 # if geopandas installed, 21 columns if not assert "percentile" not in df.columns assert "percentiles" in df.columns assert type(df["percentiles"][2]) is list assert df.loc[~df["percentiles"].isna(), "value"].isnull().all() -def test_get_stats_por_include_hash(): - """``include_hash=True`` preserves the per-computation UUID - and the upstream time-series hex hash that ``get_stats_*`` used - to return unconditionally.""" - df, _ = get_stats_por( - monitoring_location_id="USGS-12451000", - parameter_code="00060", - start_date="01-01", - end_date="01-01", - include_hash=True, - ) - assert "computation_id" in df.columns - assert "parent_time_series_id" in df.columns - - def test_get_stats_date_range(): df, _ = get_stats_date_range( monitoring_location_id="USGS-12451000", @@ -689,10 +597,7 @@ def test_get_stats_date_range(): ) assert df.shape[0] == 3 - # Default: hash IDs (computation_id, parent_time_series_id) dropped → 18 cols. - assert df.shape[1] == 18 - assert "computation_id" not in df.columns - assert "parent_time_series_id" not in df.columns + assert df.shape[1] == 20 # if geopandas installed, 21 columns if not assert "interval_type" in df.columns assert "percentile" in df.columns assert df["interval_type"].isin(["month", "calendar_year", "water_year"]).all() @@ -702,12 +607,8 @@ def test_get_channel(): df, _ = get_channel(monitoring_location_id="USGS-02238500") assert df.shape[0] > 470 - # Default: channel_measurements_id (UUID) and field_visit_id (UUID) - # are dropped. 27 → 25 cols. 
- assert df.shape[1] == 25 # if geopandas installed, fewer if not - assert "channel_measurements_id" not in df.columns - assert "field_visit_id" not in df.columns - assert "monitoring_location_id" in df.columns + assert df.shape[1] == 27 # if geopandas installed, 21 columns if not + assert "channel_measurements_id" in df.columns class TestCheckMonitoringLocationId: diff --git a/tests/waterdata_utils_test.py b/tests/waterdata_utils_test.py index 72d71ef0..bb5ece10 100644 --- a/tests/waterdata_utils_test.py +++ b/tests/waterdata_utils_test.py @@ -376,88 +376,6 @@ def test_get_stats_data_warning_includes_next_token(caplog, monkeypatch): assert any("tok2" in m for m in warnings_), warnings_ -def test_get_stats_data_drops_hash_ids_by_default(monkeypatch): - """``get_stats_data`` drops ``computation_id`` and - ``parent_time_series_id`` from the result by default — the - ``include_hash=False`` counterpart for the stats path.""" - from dataretrieval.waterdata.utils import get_stats_data - - monkeypatch.setattr( - _utils_module, - "_handle_stats_nesting", - mock.MagicMock( - return_value=pd.DataFrame( - { - "monitoring_location_id": ["USGS-1"], - "parameter_code": ["00060"], - "computation_id": ["7d70379f-8452-44cd-b026-24dfa11f8503"], - "parent_time_series_id": ["9cca880dec4846ec8cbdd05f3e22603e"], - "value": [1.0], - } - ) - ), - ) - - page1 = mock.MagicMock() - page1.status_code = 200 - page1.json.return_value = {"next": None, "features": []} - page1.elapsed = __import__("datetime").timedelta(milliseconds=1) - page1.headers = {} - page1.url = "https://example/stats" - client = mock.MagicMock(spec=httpx.Client) - client.send.return_value = page1 - - df, _ = get_stats_data( - args={"monitoring_location_id": "USGS-1"}, - service="observationNormals", - expand_percentiles=False, - client=client, - ) - assert "computation_id" not in df.columns - assert "parent_time_series_id" not in df.columns - assert "monitoring_location_id" in df.columns - assert "parameter_code" in 
df.columns - assert "value" in df.columns - - -def test_get_stats_data_keeps_hash_ids_when_opted_in(monkeypatch): - """``include_hash=True`` preserves the legacy stats columns.""" - from dataretrieval.waterdata.utils import get_stats_data - - monkeypatch.setattr( - _utils_module, - "_handle_stats_nesting", - mock.MagicMock( - return_value=pd.DataFrame( - { - "monitoring_location_id": ["USGS-1"], - "computation_id": ["7d70379f-8452-44cd-b026-24dfa11f8503"], - "parent_time_series_id": ["9cca880dec4846ec8cbdd05f3e22603e"], - } - ) - ), - ) - - page1 = mock.MagicMock() - page1.status_code = 200 - page1.json.return_value = {"next": None, "features": []} - page1.elapsed = __import__("datetime").timedelta(milliseconds=1) - page1.headers = {} - page1.url = "https://example/stats" - client = mock.MagicMock(spec=httpx.Client) - client.send.return_value = page1 - - df, _ = get_stats_data( - args={"monitoring_location_id": "USGS-1"}, - service="observationNormals", - expand_percentiles=False, - client=client, - include_hash=True, - ) - assert "computation_id" in df.columns - assert "parent_time_series_id" in df.columns - - def test_handle_stats_nesting_tolerates_missing_drop_columns(): """If the upstream stats response shape ever changes such that one of the columns we try to drop ("type", "properties.data") is absent, the @@ -613,79 +531,6 @@ def test_arrange_cols_keeps_geometry_when_present(): assert "geometry" in result.columns -def test_arrange_cols_drops_hash_ids_by_default(): - """Default ``include_hash=False`` drops the per-record UUID - (renamed to ``daily_id``) and secondary hash columns - (``time_series_id``), keeping stable identifiers.""" - df = pd.DataFrame( - { - "id": ["uuid-a"], - "time_series_id": ["hex-1"], - "monitoring_location_id": ["USGS-01"], - "value": [1.0], - } - ) - result = _arrange_cols(df, properties=None, output_id="daily_id") - assert "daily_id" not in result.columns - assert "time_series_id" not in result.columns - assert 
"monitoring_location_id" in result.columns - assert "value" in result.columns - - -def test_arrange_cols_include_hash_keeps_them(): - """``include_hash=True`` preserves the legacy behavior — hash - columns are kept and the per-record UUID lands at the end of the - column order.""" - df = pd.DataFrame( - { - "id": ["uuid-a"], - "time_series_id": ["hex-1"], - "monitoring_location_id": ["USGS-01"], - "value": [1.0], - } - ) - result = _arrange_cols(df, properties=None, output_id="daily_id", include_hash=True) - assert "daily_id" in result.columns - assert "time_series_id" in result.columns - # Legacy ordering: ``daily_id`` moves to the end. - assert result.columns[-1] == "daily_id" - - -def test_arrange_cols_explicit_properties_keep_hash_ids(): - """A user who lists a hash column in ``properties`` gets it back even - with the default ``include_hash=False`` — explicit beats default.""" - df = pd.DataFrame( - { - "id": ["uuid-a"], - "time_series_id": ["hex-1"], - "monitoring_location_id": ["USGS-01"], - "value": [1.0], - } - ) - result = _arrange_cols( - df, - properties=["daily_id", "time_series_id", "value"], - output_id="daily_id", - ) - assert "daily_id" in result.columns - assert "time_series_id" in result.columns - - -def test_arrange_cols_non_hash_output_id_kept(): - """``monitoring_location_id`` (the output_id for monitoring-locations) - is NOT a hash — the AGENCY-ID format is stable and human-meaningful — - so it must stay even under the default.""" - df = pd.DataFrame( - { - "id": ["USGS-01"], - "agency_code": ["USGS"], - } - ) - result = _arrange_cols(df, properties=None, output_id="monitoring_location_id") - assert "monitoring_location_id" in result.columns - assert result.loc[0, "monitoring_location_id"] == "USGS-01" - - # --- _format_api_dates ------------------------------------------------------- @@ -927,49 +772,3 @@ def test_check_ogc_requests_raises_typed_on_5xx(httpx_mock): ) with pytest.raises(ServiceUnavailable): 
_check_ogc_requests(endpoint="daily", req_type="schema") - - -def _reach_fetch_sentinel(monkeypatch): - """Stub ``_fetch_once`` to raise a recognizable sentinel, so a test can - prove ``get_ogc_data`` got past the properties/queryables handling (rather - than crashing earlier) without doing any network I/O.""" - - def _boom(_args): - raise RuntimeError("reached _fetch_once") - - monkeypatch.setattr(_utils_module, "_fetch_once", _boom) - - -def test_get_ogc_data_falls_back_when_queryables_unavailable(monkeypatch): - # A queryables fetch failure (ServiceUnavailable / httpx error) must be - # caught and fall through to the client-side drop -- not crash. Regression: - # the except clause referenced an unimported ``requests``, so any failure - # raised NameError instead of falling back. - def _boom_queryables(*args, **kwargs): - raise ServiceUnavailable("503") - - monkeypatch.setattr(_utils_module, "_default_non_hash_properties", _boom_queryables) - _reach_fetch_sentinel(monkeypatch) - # Reaching the sentinel proves the queryables error was caught and the call - # continued; a NameError or the bare ServiceUnavailable("503") would not - # match and would fail the test. - with pytest.raises(RuntimeError, match="reached _fetch_once"): - _utils_module.get_ogc_data( - args={"monitoring_location_id": "USGS-1"}, - output_id="daily_id", - service="daily", - ) - - -def test_get_ogc_data_accepts_scalar_properties(monkeypatch): - # A scalar (non-list) ``properties`` -- reachable via get_reference_table's - # raw query dict -- must be treated as a single column, not crash - # ``_properties_unspecified`` (all(pd.isna("time")) -> TypeError) or become - # a per-character set in ``_arrange_cols``. 
- _reach_fetch_sentinel(monkeypatch) - with pytest.raises(RuntimeError, match="reached _fetch_once"): - _utils_module.get_ogc_data( - args={"properties": "time", "monitoring_location_id": "USGS-1"}, - output_id="daily_id", - service="daily", - ) diff --git a/tests/waterdata_xarray_test.py b/tests/waterdata_xarray_test.py index af4e1981..fc3eac69 100644 --- a/tests/waterdata_xarray_test.py +++ b/tests/waterdata_xarray_test.py @@ -171,6 +171,36 @@ def test_build_stats_flat_dataset(): assert ds.attrs["Conventions"] == "CF-1.11" +def test_build_stats_drops_hash_columns(): + # The plain getters no longer drop hash IDs, so the flat stats builder must + # drop the stats service's per-record / per-series UUIDs itself to keep the + # CF dataset free of opaque coordinates. + df = pd.DataFrame( + { + "monitoring_location_id": ["USGS-1"], + "parameter_code": ["00060"], + "computation_id": ["7d70379f-8452-44cd-b026-24dfa11f8503"], + "parent_time_series_id": ["9cca880dec4846ec8cbdd05f3e22603e"], + "p50_va": [120.0], + } + ) + ds = wdx._build_stats(df, _meta(), "statistics") + assert "computation_id" not in ds.variables + assert "parent_time_series_id" not in ds.variables + assert "p50_va" in ds.data_vars + + +def test_ragged_omits_hash_columns(): + # The synthetic daily frame carries a time_series_id hash column; the ragged + # builder whitelists the columns it converts, so the hash never surfaces in + # the dataset (the timeseries path stays hash-free without the getter's + # help). 
+ ds = wdx._build_ragged( + _daily_frame(), _meta(), service="daily", series_meta=_DISCHARGE_META + ) + assert "time_series_id" not in ds.variables + + def test_public_wrappers_exist_and_are_documented(): for name in wdx.__all__: fn = getattr(wdx, name) From 15d37896239757d4d1ab110e0073d293efbe058c Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Wed, 27 May 2026 15:10:43 -0500 Subject: [PATCH 22/24] fix(waterdata.xarray): drop time_series_id from the flat stats dataset The stats flat conversion keeps every column, and _handle_stats_nesting surfaces all outer feature-property keys, so a time_series_id could leak into the CF dataset as an opaque-UUID variable. Add it to _build_stats's drop set (alongside computation_id / parent_time_series_id), matching the coverage of the removed pandas-path hash drop. Test asserts all three are dropped. Co-Authored-By: Claude Opus 4.7 (1M context) --- dataretrieval/waterdata/xarray.py | 2 +- tests/waterdata_xarray_test.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/dataretrieval/waterdata/xarray.py b/dataretrieval/waterdata/xarray.py index b695aaad..3597de02 100644 --- a/dataretrieval/waterdata/xarray.py +++ b/dataretrieval/waterdata/xarray.py @@ -711,7 +711,7 @@ def _build_stats(df, base_meta, service): # opaque hash IDs never reach those datasets. This flat path keeps every # column, so drop the stats service's hash-valued IDs (and geometry) here to # keep the CF dataset free of per-record UUID coordinates. 
- drop = ("geometry", "computation_id", "parent_time_series_id") + drop = ("geometry", "computation_id", "parent_time_series_id", "time_series_id") flat = df.drop(columns=[c for c in drop if c in df.columns]) ds = _xr.Dataset.from_dataframe(flat.reset_index(drop=True)) ds.attrs = _dataset_attrs(service, base_meta) diff --git a/tests/waterdata_xarray_test.py b/tests/waterdata_xarray_test.py index fc3eac69..d0013f23 100644 --- a/tests/waterdata_xarray_test.py +++ b/tests/waterdata_xarray_test.py @@ -181,12 +181,13 @@ def test_build_stats_drops_hash_columns(): "parameter_code": ["00060"], "computation_id": ["7d70379f-8452-44cd-b026-24dfa11f8503"], "parent_time_series_id": ["9cca880dec4846ec8cbdd05f3e22603e"], + "time_series_id": ["b026-24dfa11f8503"], "p50_va": [120.0], } ) ds = wdx._build_stats(df, _meta(), "statistics") - assert "computation_id" not in ds.variables - assert "parent_time_series_id" not in ds.variables + for hash_col in ("computation_id", "parent_time_series_id", "time_series_id"): + assert hash_col not in ds.variables assert "p50_va" in ds.data_vars From 055fda64f882eca20066893882e66d4363cfca66 Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Wed, 27 May 2026 15:27:02 -0500 Subject: [PATCH 23/24] fix(waterdata.xarray): drop the invalid water_temperature CF standard_name USGS 00010 mapped to standard_name "water_temperature", which is not in the CF Standard Name Table (only "sea_water_temperature" exists), so the temperature variable shipped a CF-non-compliant standard_name. Remove the mapping per the map's conservative "confident match only" policy -- no valid CF name fits generic USGS freshwater/groundwater temperature, and "sea_water_temperature" is wrong-domain. The variable keeps its long_name. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- dataretrieval/waterdata/types.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/dataretrieval/waterdata/types.py b/dataretrieval/waterdata/types.py index 4e5198f7..f0b441c1 100644 --- a/dataretrieval/waterdata/types.py +++ b/dataretrieval/waterdata/types.py @@ -114,7 +114,11 @@ # codes without a confident match are left without a standard_name. CF_STANDARD_NAMES = { "00060": "water_volume_transport_in_river_channel", - "00010": "water_temperature", + # 00010 (water temperature) is intentionally omitted: ``water_temperature`` + # is NOT a CF standard name, and the only valid CF water-temperature name, + # ``sea_water_temperature``, is wrong-domain for USGS freshwater/groundwater. + # Leaving it unmapped keeps the variable's ``long_name`` without emitting an + # invalid or misleading ``standard_name``. "00065": "water_surface_height_above_reference_datum", "63160": "water_surface_height_above_reference_datum", "00045": "lwe_thickness_of_precipitation_amount", From 20f95e731107366a08bc29919c038efcb990368d Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Wed, 27 May 2026 15:41:00 -0500 Subject: [PATCH 24/24] feat(waterdata.xarray): record vertical_datum to distinguish stage parameters Gage height (00065) and stream water level (63160) share the CF standard_name water_surface_height_above_reference_datum, which left them indistinguishable. Attach a vertical_datum attribute (matching USGS's column name) recording the reference datum: "local site datum" for gage height (00065), "NAVD88" for stream water level (63160). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- dataretrieval/waterdata/types.py | 10 ++++++++++ dataretrieval/waterdata/xarray.py | 10 +++++++++- tests/waterdata_xarray_test.py | 18 ++++++++++++++++++ 3 files changed, 37 insertions(+), 1 deletion(-) diff --git a/dataretrieval/waterdata/types.py b/dataretrieval/waterdata/types.py index f0b441c1..93b8d19b 100644 --- a/dataretrieval/waterdata/types.py +++ b/dataretrieval/waterdata/types.py @@ -123,3 +123,13 @@ "63160": "water_surface_height_above_reference_datum", "00045": "lwe_thickness_of_precipitation_amount", } + +# USGS parameter code -> vertical reference datum, attached as a +# ``vertical_datum`` attribute. The two water-surface-height parameters share +# the CF standard_name water_surface_height_above_reference_datum, so the datum +# distinguishes them: gage height (00065) is measured from a local site (gage) +# datum, while stream water level (63160) is referenced to NAVD88. +CF_VERTICAL_DATUM = { + "00065": "local site datum", + "63160": "NAVD88", +} diff --git a/dataretrieval/waterdata/xarray.py b/dataretrieval/waterdata/xarray.py index 3597de02..2362181f 100644 --- a/dataretrieval/waterdata/xarray.py +++ b/dataretrieval/waterdata/xarray.py @@ -59,7 +59,12 @@ from . 
import api as _api from .nearest import get_nearest_continuous as _get_nearest_continuous -from .types import CF_CELL_METHODS, CF_STANDARD_NAMES, CF_UNIT_MAP +from .types import ( + CF_CELL_METHODS, + CF_STANDARD_NAMES, + CF_UNIT_MAP, + CF_VERTICAL_DATUM, +) __all__ = [ "get_continuous", @@ -250,6 +255,9 @@ def _var_attrs(desc, *, unit, pcode, stat, default_cell_method, ancillary, name) sn = CF_STANDARD_NAMES.get(str(pcode)) if sn: attrs["standard_name"] = sn + datum = CF_VERTICAL_DATUM.get(str(pcode)) + if datum: + attrs["vertical_datum"] = datum attrs["usgs_parameter_code"] = str(pcode) if stat is not None and _pd.notna(stat): diff --git a/tests/waterdata_xarray_test.py b/tests/waterdata_xarray_test.py index d0013f23..19b20087 100644 --- a/tests/waterdata_xarray_test.py +++ b/tests/waterdata_xarray_test.py @@ -100,6 +100,24 @@ def test_missing_standard_name_is_omitted(): assert ds["mystery"].attrs["usgs_parameter_code"] == "99999" +def test_vertical_datum_distinguishes_stage_parameters(): + # 00065 (gage height) and 63160 (water level above NAVD88) share the CF + # standard_name water_surface_height_above_reference_datum; the vertical_datum + # attribute records the differing reference datum so they're distinguishable. 
+ for pcode, datum in (("00065", "local site datum"), ("63160", "NAVD88")): + df = _daily_frame() + df["parameter_code"] = pcode + ds = wdx._build_ragged(df, _meta(), service="continuous", series_meta={}) + v = ds["value"] + assert v.attrs["standard_name"] == "water_surface_height_above_reference_datum" + assert v.attrs["vertical_datum"] == datum + # a parameter with no datum mapping gets no vertical_datum attr + ds = wdx._build_ragged( + _daily_frame(), _meta(), service="daily", series_meta=_DISCHARGE_META + ) + assert "vertical_datum" not in ds["value"].attrs + + def test_multiple_parameters_outer_join_on_time(): # discharge at t1,t2 ; temperature at t2,t3 -> union time, NaN fill q = _daily_frame(values=(100, 110), times=("2024-06-01", "2024-06-02"))