From 1dfe18574b82eb5d3769a80e0ed9a0e05f192aa7 Mon Sep 17 00:00:00 2001 From: Thomas Sibley Date: Wed, 17 Jan 2024 14:46:41 -0800 Subject: [PATCH 1/4] remote.nextstrain_dot_org: Refactor Charon "getAvailable" API response adapters Moves them closer to the actual Charon API request/response and makes the Resource/Dataset/Narrative model classes no longer tied to Charon's response format. The latter will be handy when manually instantiating them in another location outside of the current places in _ls(), but also make it easier when/if we move to another resource listing API. --- nextstrain/cli/remote/nextstrain_dot_org.py | 65 +++++++++++---------- 1 file changed, 35 insertions(+), 30 deletions(-) diff --git a/nextstrain/cli/remote/nextstrain_dot_org.py b/nextstrain/cli/remote/nextstrain_dot_org.py index ceb841bd..3ba7bd32 100644 --- a/nextstrain/cli/remote/nextstrain_dot_org.py +++ b/nextstrain/cli/remote/nextstrain_dot_org.py @@ -103,16 +103,15 @@ class NormalizedPath(PurePosixPath): class Resource: """ - Base class for a remote Nextstrain resource, as described by a Charon API - "getAvailable" response. + Base class for a remote Nextstrain resource described by its *path*. Concretely, either a :class:`Dataset` or :class:`Narrative` currently. """ path: NormalizedPath subresources: List['SubResource'] - def __init__(self, api_item: dict): - self.path = normalize_path(api_item["request"]) + def __init__(self, path: str): + self.path = normalize_path(path) class SubResource(NamedTuple): @@ -136,42 +135,26 @@ class SubResource(NamedTuple): class Dataset(Resource): """ - A remote Nextstrain dataset, as described by a Charon API response, - extended for the nextstrain.org RESTful API. + A remote Nextstrain dataset as described by its *path* and optional list of + *sidecars*. 
""" - def __init__(self, api_item): - super().__init__(api_item) + def __init__(self, path: str, sidecars: Optional[List[str]] = None): + super().__init__(path) - default_sidecars = ["root-sequence", "tip-frequencies", "measurements"] + if sidecars is None: + sidecars = ["root-sequence", "tip-frequencies", "measurements"] self.subresources = [ SubResource("application/vnd.nextstrain.dataset.main+json", ".json", primary = True), - # XXX TODO: The "sidecars" field in the /charon/getAvailable API - # response doesn't actually exist yet and its use here is - # prospective. - # - # I plan to extend the /charon/getAvailable API endpoint (or maybe - # switch to a new endpoint) in the future to include the "sidecars" - # field listing the available sidecars for each dataset, so that - # this code only has to try to fetch what is reported to exist. - # More than just reducing requests, the primary upshot is looser - # coupling by avoiding the need to update the hardcoded list of - # sidecars here and get people to upgrade their installed version - # of this CLI if we add a new sidecar in the future. Other API - # clients would also likely benefit. - # - # -trs, 18 August 2021 - # *[SubResource(f"application/vnd.nextstrain.dataset.{type}+json", ".json") - for type in api_item.get("sidecars", default_sidecars)], + for type in sidecars], ] class Narrative(Resource): """ - A remote Nextstrain narrative, as described by a Charon API response, - extended for the nextstrain.org RESTful API. + A remote Nextstrain narrative as described by its *path*. """ subresources = [ SubResource("text/vnd.nextstrain.narrative+markdown", ".md", primary = True), @@ -442,9 +425,31 @@ def matches_path(x: Resource): else: return x.path == path + def to_dataset(api_item: dict) -> Dataset: + # XXX TODO: The "sidecars" field in the /charon/getAvailable API + # response doesn't actually exist yet and its use here is + # prospective. 
+ # + # I plan to extend the /charon/getAvailable API endpoint (or maybe + # switch to a new endpoint) in the future to include the "sidecars" + # field listing the available sidecars for each dataset, so that + # this code only has to try to fetch what is reported to exist. + # More than just reducing requests, the primary upshot is looser + # coupling by avoiding the need to update the hardcoded list of + # sidecars here and get people to upgrade their installed version + # of this CLI if we add a new sidecar in the future. Other API + # clients would also likely benefit. + # + # -trs, 18 August 2021 + # + return Dataset(api_item["request"], api_item.get("sidecars")) + + def to_narrative(api_item: dict) -> Narrative: + return Narrative(api_item["request"]) + return [ - *filter(matches_path, map(Dataset, available["datasets"])), - *filter(matches_path, map(Narrative, available["narratives"])), + *filter(matches_path, map(to_dataset, available["datasets"])), + *filter(matches_path, map(to_narrative, available["narratives"])), ] From 5f83c222cf96b52bb3ea541ba21a7f479f40ddcf Mon Sep 17 00:00:00 2001 From: Thomas Sibley Date: Wed, 17 Jan 2024 14:53:21 -0800 Subject: [PATCH 2/4] remote.nextstrain_dot_org: Support downloading of core datasets which aren't in the manifest MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This allows downloading of datasets like https://nextstrain.org/enterovirus/d68/vp1/2020-01-23 https://nextstrain.org/nextclade/sars-cov-2/21L and others, as reasonably expected.¹ It also will, with one more minor tweak to follow, allow downloading of past snapshots of resources (e.g. /zika@2023-01-01). Switches from an assert on expected media type to a conditional UserError, supported by the new Resource.__str__() method, since for single resource downloads we no longer have the assurance of knowing it exists already. 
¹ --- nextstrain/cli/remote/nextstrain_dot_org.py | 32 +++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/nextstrain/cli/remote/nextstrain_dot_org.py b/nextstrain/cli/remote/nextstrain_dot_org.py index 3ba7bd32..b7a0ae4c 100644 --- a/nextstrain/cli/remote/nextstrain_dot_org.py +++ b/nextstrain/cli/remote/nextstrain_dot_org.py @@ -132,6 +132,21 @@ class SubResource(NamedTuple): file_extension: str primary: bool = False + def __str__(self) -> str: + type, subtype = self.media_type.split("/", 1) + subtype_sans_suffix, *_ = subtype.split("+", 1) + subtype_tree = tuple(subtype_sans_suffix.split(".")) + + resource = ( + "dataset" if subtype_tree[0:3] == ("vnd", "nextstrain", "dataset") else + "narrative" if subtype_tree[0:3] == ("vnd", "nextstrain", "narrative") else + self.media_type + ) + + sidecar = sidecar_suffix(self.media_type) + + return f"{resource} ({sidecar})" if sidecar else resource + class Dataset(Resource): """ @@ -327,7 +342,18 @@ def download(url: URL, local_path: Path, recursively: bool = False, dry_run: boo with requests.Session() as http: http.auth = auth(origin) - resources = _ls(origin, path, recursively = recursively, http = http) + if recursively: + resources = _ls(origin, path, recursively = recursively, http = http) + else: + # Avoid the query and just try to download the single resource. + # This saves a request for single-dataset (or narrative) downloads, + # but also allows downloading core datasets which aren't in the + # manifest. (At least until the manifest goes away.) 
+ # -trs, 9 Nov 2022 + if narratives_only(path): + resources = [Narrative(str(path))] + else: + resources = [Dataset(str(path))] if not resources: raise UserError(f"Path {path} does not seem to exist") @@ -352,7 +378,9 @@ def download(url: URL, local_path: Path, recursively: bool = False, dry_run: boo # Check for bad response raise_for_status(response) - assert content_media_type(response) == subresource.media_type + + if content_media_type(response) != subresource.media_type: + raise UserError(f"Path {path} does not seem to be a {subresource}.") # Local destination if local_path.is_dir(): From 42a1c8642c3c44fa0469ac04dbc2ac07885495d6 Mon Sep 17 00:00:00 2001 From: Thomas Sibley Date: Wed, 17 Jan 2024 15:10:45 -0800 Subject: [PATCH 3/4] remote.nextstrain_dot_org: Support downloading of past snapshots of resources MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Using the same @YYYY-MM-DD suffix syntax as on the web. Server-side support for this has recently landed.¹ ¹ --- nextstrain/cli/remote/nextstrain_dot_org.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextstrain/cli/remote/nextstrain_dot_org.py b/nextstrain/cli/remote/nextstrain_dot_org.py index b7a0ae4c..b64f515a 100644 --- a/nextstrain/cli/remote/nextstrain_dot_org.py +++ b/nextstrain/cli/remote/nextstrain_dot_org.py @@ -682,7 +682,7 @@ def api_endpoint(origin: Origin, path: Union[str, PurePosixPath]) -> str: >>> api_endpoint(URL("http://localhost:5000/x/").origin, "a/b/c") 'http://localhost:5000/a/b/c' """ - return origin + "/" + urlquote(str(path).lstrip("/")) + return origin + "/" + urlquote(str(path).lstrip("/"), safe = "/@") class auth(requests.auth.AuthBase): From ad276fc64f8a153125724e0ab8af77524e70f408 Mon Sep 17 00:00:00 2001 From: Thomas Sibley Date: Wed, 17 Jan 2024 17:04:07 -0800 Subject: [PATCH 4/4] CHANGES: Document new `nextstrain remote download` support for non-manifest datasets --- CHANGES.md | 16 ++++++++++++++++ 1 file
changed, 16 insertions(+) diff --git a/CHANGES.md b/CHANGES.md index 08da27b8..64517aee 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -65,6 +65,22 @@ This release drops support for Python versions 3.6 and 3.7 and adds support for the failure. ([#341](https://github.com/nextstrain/cli/pull/341)) +* `nextstrain remote download` now supports downloading core datasets which are + only visible on the web by direct access via their URL. For example, the + following now work (where in previous versions they did not): + + nextstrain remote download https://nextstrain.org/nextclade/sars-cov-2/21L + nextstrain remote download https://nextstrain.org/enterovirus/d68/vp1/2020-01-23 + + This support also covers [past snapshots of + datasets](https://docs.nextstrain.org/en/latest/guides/versions.html), which + is a recently-added feature to nextstrain.org. For example: + + nextstrain remote download https://nextstrain.org/flu/seasonal/h3n2/ha/6y@2023-07-01 + nextstrain remote download https://nextstrain.org/ncov/gisaid/21L/global/6m@2024-01-09 + + ([#345](https://github.com/nextstrain/cli/pull/345)) + ## Bug fixes * Commands making use of user authentication (e.g. `nextstrain login`,