Skip to content

Commit 67444e5

Browse files
authored
Merge pull request #205 from ehinman/add-ref-tables
add reference tables function
2 parents 7298d3b + fa20c08 commit 67444e5

6 files changed: 141 additions & 23 deletions

File tree

.github/workflows/sphinx-docs.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ jobs:
1818
shell: bash -l {0}
1919
run: |
2020
python -m pip install --upgrade pip
21+
pip install "docutils<0.22"
2122
pip install .[doc,nldi]
2223
ipython kernel install --name "python3" --user
2324
sudo apt update -y && sudo apt install -y latexmk texlive-latex-recommended texlive-latex-extra texlive-fonts-recommended dvipng pandoc

dataretrieval/waterdata/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
get_latest_continuous,
2020
get_latest_daily,
2121
get_monitoring_locations,
22+
get_reference_table,
2223
get_samples,
2324
get_time_series_metadata,
2425
)
@@ -37,6 +38,7 @@
3738
"get_latest_continuous",
3839
"get_latest_daily",
3940
"get_monitoring_locations",
41+
"get_reference_table",
4042
"get_samples",
4143
"get_time_series_metadata",
4244
"_check_profiles",

dataretrieval/waterdata/api.py

Lines changed: 63 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,17 @@
1616
from dataretrieval.utils import BaseMetadata, to_str
1717
from dataretrieval.waterdata.types import (
1818
CODE_SERVICES,
19+
METADATA_COLLECTIONS,
1920
PROFILE_LOOKUP,
2021
PROFILES,
2122
SERVICES,
2223
)
23-
from dataretrieval.waterdata.utils import SAMPLES_URL, get_ogc_data
24+
from dataretrieval.waterdata.utils import (
25+
SAMPLES_URL,
26+
get_ogc_data,
27+
_construct_api_requests,
28+
_walk_pages
29+
)
2430

2531
# Set up logger for this module
2632
logger = logging.getLogger(__name__)
@@ -1388,6 +1394,62 @@ def get_field_measurements(
13881394

13891395
return get_ogc_data(args, output_id, service)
13901396

1397+
def get_reference_table(
1398+
collection: str,
1399+
limit: Optional[int] = None,
1400+
) -> Tuple[pd.DataFrame, BaseMetadata]:
1401+
"""Get metadata reference tables for the USGS Water Data API.
1402+
1403+
Reference tables provide the range of allowable values for parameter
1404+
arguments in the waterdata module.
1405+
1406+
Parameters
1407+
----------
1408+
collection : string
1409+
One of the following options: "agency-codes", "altitude-datums",
1410+
"aquifer-codes", "aquifer-types", "coordinate-accuracy-codes",
1411+
"coordinate-datum-codes", "coordinate-method-codes", "counties",
1412+
"hydrologic-unit-codes", "medium-codes", "national-aquifer-codes",
1413+
"parameter-codes", "reliability-codes", "site-types", "states",
1414+
"statistic-codes", "topographic-codes", "time-zone-codes"
1415+
limit : numeric, optional
1416+
The optional limit parameter is used to control the subset of the
1417+
selected features that should be returned in each page. The maximum
1418+
allowable limit is 50000. It may be beneficial to set this number lower
1419+
if your internet connection is spotty. The default (None) will set the
1420+
limit to the maximum allowable limit for the service.
1421+
"""
1422+
valid_code_services = get_args(METADATA_COLLECTIONS)
1423+
if collection not in valid_code_services:
1424+
raise ValueError(
1425+
f"Invalid code service: '{collection}'. "
1426+
f"Valid options are: {valid_code_services}."
1427+
)
1428+
1429+
req = _construct_api_requests(
1430+
service=collection,
1431+
limit=limit,
1432+
skip_geometry=True,
1433+
)
1434+
# Run API request and iterate through pages if needed
1435+
return_list, response = _walk_pages(
1436+
geopd=False, req=req
1437+
)
1438+
1439+
# Give ID column a more meaningful name
1440+
if collection.endswith("s"):
1441+
return_list = return_list.rename(
1442+
columns={"id": f"{collection[:-1].replace('-', '_')}_id"}
1443+
)
1444+
else:
1445+
return_list = return_list.rename(
1446+
columns={"id": f"{collection.replace('-', '_')}_id"}
1447+
)
1448+
1449+
# Create metadata object from response
1450+
metadata = BaseMetadata(response)
1451+
return return_list, metadata
1452+
13911453

13921454
def get_codes(code_service: CODE_SERVICES) -> pd.DataFrame:
13931455
"""Return codes from a Samples code service.

dataretrieval/waterdata/types.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,27 @@
1111
"states",
1212
]
1313

14+
METADATA_COLLECTIONS = Literal[
15+
"agency-codes",
16+
"altitude-datums",
17+
"aquifer-codes",
18+
"aquifer-types",
19+
"coordinate-accuracy-codes",
20+
"coordinate-datum-codes",
21+
"coordinate-method-codes",
22+
"counties",
23+
"hydrologic-unit-codes",
24+
"medium-codes",
25+
"national-aquifer-codes",
26+
"parameter-codes",
27+
"reliability-codes",
28+
"site-types",
29+
"states",
30+
"statistic-codes",
31+
"topographic-codes",
32+
"time-zone-codes",
33+
]
34+
1435
SERVICES = Literal[
1536
"activities",
1637
"locations",

dataretrieval/waterdata/utils.py

Lines changed: 22 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -648,35 +648,38 @@ def _arrange_cols(
648648
pd.DataFrame or gpd.GeoDataFrame
649649
The DataFrame with columns rearranged and/or renamed according to the specified properties and output_id.
650650
"""
651+
652+
# Rename id column to output_id
653+
df = df.rename(columns={"id": output_id})
654+
655+
# If properties are provided, filter to only those columns
656+
# plus geometry if skip_geometry is False
651657
if properties and not all(pd.isna(properties)):
652-
if "id" not in properties:
653-
# If user refers to service-specific output id in properties,
654-
# then rename the "id" column to the output_id (id column is
655-
# automatically included).
656-
if output_id in properties:
657-
df = df.rename(columns={"id": output_id})
658-
# If output id is not in properties, but user requests the plural
659-
# of the output_id (e.g. "monitoring_locations_id"), then rename
660-
# "id" to plural. This is pretty niche.
661-
else:
662-
plural = output_id.replace("_id", "s_id")
663-
if plural in properties:
664-
df = df.rename(columns={"id": plural})
658+
# Make sure geometry stays in the dataframe if skip_geometry is False
659+
if 'geometry' in df.columns and 'geometry' not in properties:
660+
properties.append('geometry')
661+
# id is technically a valid column from the service, but these
662+
# functions make the name more specific. So, if someone requests
663+
# 'id', give them the output_id column
664+
if 'id' in properties:
665+
properties[properties.index('id')] = output_id
665666
df = df.loc[:, [col for col in properties if col in df.columns]]
666-
else:
667-
df = df.rename(columns={"id": output_id})
668-
667+
669668
# Move meaningless-to-user, extra id columns to the end
670669
# of the dataframe, if they exist
671-
extra_id_cols = set(df.columns).intersection({
670+
extra_id_col = set(df.columns).intersection({
672671
"latest_continuous_id",
673672
"latest_daily_id",
674673
"daily_id",
675674
"continuous_id",
676675
"field_measurement_id"
677676
})
678-
if extra_id_cols:
679-
id_col_order = [col for col in df.columns if col not in extra_id_cols] + list(extra_id_cols)
677+
678+
# If the arbitrary id column is returned (either due to properties
679+
# being none or NaN), then move it to the end of the dataframe, but
680+
# if part of properties, keep in requested order
681+
if extra_id_col and (properties is None or all(pd.isna(properties))):
682+
id_col_order = [col for col in df.columns if col not in extra_id_col] + list(extra_id_col)
680683
df = df.loc[:, id_col_order]
681684

682685
return df

tests/waterdata_test.py

Lines changed: 32 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
get_latest_daily,
1717
get_field_measurements,
1818
get_time_series_metadata,
19+
get_reference_table
1920
)
2021

2122
def mock_request(requests_mock, request_url, file_path):
@@ -139,11 +140,20 @@ def test_get_daily_properties():
139140
time="2025-01-01/..",
140141
properties=["daily_id", "monitoring_location_id", "parameter_code", "time", "value", "geometry"]
141142
)
142-
assert "daily_id" in df.columns
143-
assert "geometry" in df.columns
143+
assert "daily_id" == df.columns[0]
144+
assert "geometry" == df.columns[-1]
144145
assert df.shape[1] == 6
145146
assert df.parameter_code.unique().tolist() == ["00060"]
146147

148+
def test_get_daily_properties_id():
149+
df,_ = get_daily(
150+
monitoring_location_id="USGS-05427718",
151+
parameter_code="00060",
152+
time="2025-01-01/..",
153+
properties=["monitoring_location_id", "id", "parameter_code", "time", "value", "geometry"]
154+
)
155+
assert "daily_id" == df.columns[1]
156+
147157
def test_get_daily_no_geometry():
148158
df,_ = get_daily(
149159
monitoring_location_id="USGS-05427718",
@@ -187,7 +197,7 @@ def test_get_latest_continuous():
187197
monitoring_location_id=["USGS-05427718", "USGS-05427719"],
188198
parameter_code=["00060", "00065"]
189199
)
190-
assert "latest_continuous_id" in df.columns
200+
assert "latest_continuous_id" == df.columns[-1]
191201
assert df.shape[0] <= 4
192202
assert df.statistic_id.unique().tolist() == ["00011"]
193203
assert hasattr(md, 'url')
@@ -204,6 +214,15 @@ def test_get_latest_daily():
204214
assert hasattr(md, 'url')
205215
assert hasattr(md, 'query_time')
206216

217+
def test_get_latest_daily_properties_geometry():
218+
df, md = get_latest_daily(
219+
monitoring_location_id=["USGS-05427718", "USGS-05427719"],
220+
parameter_code=["00060", "00065"],
221+
properties=['monitoring_location_id', 'parameter_code', 'time', 'value', 'unit_of_measure']
222+
)
223+
assert "geometry" in df.columns
224+
assert df.shape[1] == 6
225+
207226
def test_get_field_measurements():
208227
df, md = get_field_measurements(
209228
monitoring_location_id="USGS-05427718",
@@ -227,4 +246,14 @@ def test_get_time_series_metadata():
227246
assert hasattr(md, 'url')
228247
assert hasattr(md, 'query_time')
229248

249+
def test_get_reference_table():
250+
df, md = get_reference_table("agency-codes")
251+
assert "agency_code_id" in df.columns
252+
assert df.shape[0] > 0
253+
assert hasattr(md, 'url')
254+
assert hasattr(md, 'query_time')
255+
256+
def test_get_reference_table_wrong_name():
257+
with pytest.raises(ValueError):
258+
get_reference_table("agency-cod")
230259

0 commit comments

Comments (0)