Skip to content

Commit 67444e5

Browse files
authored
Merge pull request #205 from ehinman/add-ref-tables
add reference tables function
2 parents 7298d3b + fa20c08 commit 67444e5

6 files changed: 141 additions & 23 deletions

File tree

.github/workflows/sphinx-docs.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ jobs:
1818
shell: bash -l {0}
1919
run: |
2020
python -m pip install --upgrade pip
21+
pip install "docutils<0.22"
2122
pip install .[doc,nldi]
2223
ipython kernel install --name "python3" --user
2324
sudo apt update -y && sudo apt install -y latexmk texlive-latex-recommended texlive-latex-extra texlive-fonts-recommended dvipng pandoc

dataretrieval/waterdata/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
get_latest_continuous,
2020
get_latest_daily,
2121
get_monitoring_locations,
22+
get_reference_table,
2223
get_samples,
2324
get_time_series_metadata,
2425
)
@@ -37,6 +38,7 @@
3738
"get_latest_continuous",
3839
"get_latest_daily",
3940
"get_monitoring_locations",
41+
"get_reference_table",
4042
"get_samples",
4143
"get_time_series_metadata",
4244
"_check_profiles",

dataretrieval/waterdata/api.py

Lines changed: 63 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,17 @@
1616
from dataretrieval.utils import BaseMetadata, to_str
1717
from dataretrieval.waterdata.types import (
1818
CODE_SERVICES,
19+
METADATA_COLLECTIONS,
1920
PROFILE_LOOKUP,
2021
PROFILES,
2122
SERVICES,
2223
)
23-
from dataretrieval.waterdata.utils import SAMPLES_URL, get_ogc_data
24+
from dataretrieval.waterdata.utils import (
25+
SAMPLES_URL,
26+
get_ogc_data,
27+
_construct_api_requests,
28+
_walk_pages
29+
)
2430

2531
# Set up logger for this module
2632
logger = logging.getLogger(__name__)
@@ -1388,6 +1394,62 @@ def get_field_measurements(
13881394

13891395
return get_ogc_data(args, output_id, service)
13901396

1397+
def get_reference_table(
1398+
collection: str,
1399+
limit: Optional[int] = None,
1400+
) -> Tuple[pd.DataFrame, BaseMetadata]:
1401+
"""Get metadata reference tables for the USGS Water Data API.
1402+
1403+
Reference tables provide the range of allowable values for parameter
1404+
arguments in the waterdata module.
1405+
1406+
Parameters
1407+
----------
1408+
collection : string
1409+
One of the following options: "agency-codes", "altitude-datums",
1410+
"aquifer-codes", "aquifer-types", "coordinate-accuracy-codes",
1411+
"coordinate-datum-codes", "coordinate-method-codes", "counties",
1412+
"hydrologic-unit-codes", "medium-codes", "national-aquifer-codes",
1413+
"parameter-codes", "reliability-codes", "site-types", "states",
1414+
"statistic-codes", "topographic-codes", "time-zone-codes"
1415+
limit : numeric, optional
1416+
The optional limit parameter is used to control the subset of the
1417+
selected features that should be returned in each page. The maximum
1418+
allowable limit is 50000. It may be beneficial to set this number lower
1419+
if your internet connection is spotty. The default (None) will set the
1420+
limit to the maximum allowable limit for the service.
1421+
"""
1422+
valid_code_services = get_args(METADATA_COLLECTIONS)
1423+
if collection not in valid_code_services:
1424+
raise ValueError(
1425+
f"Invalid code service: '{collection}'. "
1426+
f"Valid options are: {valid_code_services}."
1427+
)
1428+
1429+
req = _construct_api_requests(
1430+
service=collection,
1431+
limit=limit,
1432+
skip_geometry=True,
1433+
)
1434+
# Run API request and iterate through pages if needed
1435+
return_list, response = _walk_pages(
1436+
geopd=False, req=req
1437+
)
1438+
1439+
# Give ID column a more meaningful name
1440+
if collection.endswith("s"):
1441+
return_list = return_list.rename(
1442+
columns={"id": f"{collection[:-1].replace('-', '_')}_id"}
1443+
)
1444+
else:
1445+
return_list = return_list.rename(
1446+
columns={"id": f"{collection.replace('-', '_')}_id"}
1447+
)
1448+
1449+
# Create metadata object from response
1450+
metadata = BaseMetadata(response)
1451+
return return_list, metadata
1452+
13911453

13921454
def get_codes(code_service: CODE_SERVICES) -> pd.DataFrame:
13931455
"""Return codes from a Samples code service.

dataretrieval/waterdata/types.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,27 @@
1111
"states",
1212
]
1313

14+
METADATA_COLLECTIONS = Literal[
15+
"agency-codes",
16+
"altitude-datums",
17+
"aquifer-codes",
18+
"aquifer-types",
19+
"coordinate-accuracy-codes",
20+
"coordinate-datum-codes",
21+
"coordinate-method-codes",
22+
"counties",
23+
"hydrologic-unit-codes",
24+
"medium-codes",
25+
"national-aquifer-codes",
26+
"parameter-codes",
27+
"reliability-codes",
28+
"site-types",
29+
"states",
30+
"statistic-codes",
31+
"topographic-codes",
32+
"time-zone-codes",
33+
]
34+
1435
SERVICES = Literal[
1536
"activities",
1637
"locations",

dataretrieval/waterdata/utils.py

Lines changed: 22 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -648,35 +648,38 @@ def _arrange_cols(
648648
pd.DataFrame or gpd.GeoDataFrame
649649
The DataFrame with columns rearranged and/or renamed according to the specified properties and output_id.
650650
"""
651+
652+
# Rename id column to output_id
653+
df = df.rename(columns={"id": output_id})
654+
655+
# If properties are provided, filter to only those columns
656+
# plus geometry if skip_geometry is False
651657
if properties and not all(pd.isna(properties)):
652-
if "id" not in properties:
653-
# If user refers to service-specific output id in properties,
654-
# then rename the "id" column to the output_id (id column is
655-
# automatically included).
656-
if output_id in properties:
657-
df = df.rename(columns={"id": output_id})
658-
# If output id is not in properties, but user requests the plural
659-
# of the output_id (e.g. "monitoring_locations_id"), then rename
660-
# "id" to plural. This is pretty niche.
661-
else:
662-
plural = output_id.replace("_id", "s_id")
663-
if plural in properties:
664-
df = df.rename(columns={"id": plural})
658+
# Make sure geometry stays in the dataframe if skip_geometry is False
659+
if 'geometry' in df.columns and 'geometry' not in properties:
660+
properties.append('geometry')
661+
# id is technically a valid column from the service, but these
662+
# functions make the name more specific. So, if someone requests
663+
# 'id', give them the output_id column
664+
if 'id' in properties:
665+
properties[properties.index('id')] = output_id
665666
df = df.loc[:, [col for col in properties if col in df.columns]]
666-
else:
667-
df = df.rename(columns={"id": output_id})
668-
667+
669668
# Move meaningless-to-user, extra id columns to the end
670669
# of the dataframe, if they exist
671-
extra_id_cols = set(df.columns).intersection({
670+
extra_id_col = set(df.columns).intersection({
672671
"latest_continuous_id",
673672
"latest_daily_id",
674673
"daily_id",
675674
"continuous_id",
676675
"field_measurement_id"
677676
})
678-
if extra_id_cols:
679-
id_col_order = [col for col in df.columns if col not in extra_id_cols] + list(extra_id_cols)
677+
678+
# If the arbitrary id column is returned (either due to properties
679+
# being none or NaN), then move it to the end of the dataframe, but
680+
# if part of properties, keep in requested order
681+
if extra_id_col and (properties is None or all(pd.isna(properties))):
682+
id_col_order = [col for col in df.columns if col not in extra_id_col] + list(extra_id_col)
680683
df = df.loc[:, id_col_order]
681684

682685
return df

tests/waterdata_test.py

Lines changed: 32 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
get_latest_daily,
1717
get_field_measurements,
1818
get_time_series_metadata,
19+
get_reference_table
1920
)
2021

2122
def mock_request(requests_mock, request_url, file_path):
@@ -139,11 +140,20 @@ def test_get_daily_properties():
139140
time="2025-01-01/..",
140141
properties=["daily_id", "monitoring_location_id", "parameter_code", "time", "value", "geometry"]
141142
)
142-
assert "daily_id" in df.columns
143-
assert "geometry" in df.columns
143+
assert "daily_id" == df.columns[0]
144+
assert "geometry" == df.columns[-1]
144145
assert df.shape[1] == 6
145146
assert df.parameter_code.unique().tolist() == ["00060"]
146147

148+
def test_get_daily_properties_id():
149+
df,_ = get_daily(
150+
monitoring_location_id="USGS-05427718",
151+
parameter_code="00060",
152+
time="2025-01-01/..",
153+
properties=["monitoring_location_id", "id", "parameter_code", "time", "value", "geometry"]
154+
)
155+
assert "daily_id" == df.columns[1]
156+
147157
def test_get_daily_no_geometry():
148158
df,_ = get_daily(
149159
monitoring_location_id="USGS-05427718",
@@ -187,7 +197,7 @@ def test_get_latest_continuous():
187197
monitoring_location_id=["USGS-05427718", "USGS-05427719"],
188198
parameter_code=["00060", "00065"]
189199
)
190-
assert "latest_continuous_id" in df.columns
200+
assert "latest_continuous_id" == df.columns[-1]
191201
assert df.shape[0] <= 4
192202
assert df.statistic_id.unique().tolist() == ["00011"]
193203
assert hasattr(md, 'url')
@@ -204,6 +214,15 @@ def test_get_latest_daily():
204214
assert hasattr(md, 'url')
205215
assert hasattr(md, 'query_time')
206216

217+
def test_get_latest_daily_properties_geometry():
218+
df, md = get_latest_daily(
219+
monitoring_location_id=["USGS-05427718", "USGS-05427719"],
220+
parameter_code=["00060", "00065"],
221+
properties=['monitoring_location_id', 'parameter_code', 'time', 'value', 'unit_of_measure']
222+
)
223+
assert "geometry" in df.columns
224+
assert df.shape[1] == 6
225+
207226
def test_get_field_measurements():
208227
df, md = get_field_measurements(
209228
monitoring_location_id="USGS-05427718",
@@ -227,4 +246,14 @@ def test_get_time_series_metadata():
227246
assert hasattr(md, 'url')
228247
assert hasattr(md, 'query_time')
229248

249+
def test_get_reference_table():
250+
df, md = get_reference_table("agency-codes")
251+
assert "agency_code_id" in df.columns
252+
assert df.shape[0] > 0
253+
assert hasattr(md, 'url')
254+
assert hasattr(md, 'query_time')
255+
256+
def test_get_reference_table_wrong_name():
257+
with pytest.raises(ValueError):
258+
get_reference_table("agency-cod")
230259

0 commit comments

Comments (0)