From e701c248ac8926198728cce9b0bef72183f56eea Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Fri, 1 Aug 2025 16:20:33 -0700 Subject: [PATCH 1/4] pathogens: use top level `workflows` key List workflows under a top level `workflows` key in the `nextstrain-pathogen.yaml` file where each workflow has its own `compatibility` key: ```yaml workflows: ingest: compatibility: nextstrain run: True phylogenetic: compatibility: nextstrain run: True ``` The top level `compatibility['nextstrain run']` boolean is still supported for backwards compatibility. From discussion with @victorlin in --- CHANGES.md | 2 +- doc/changes.md | 2 +- nextstrain/cli/command/run.py | 2 +- nextstrain/cli/command/version.py | 6 +++--- nextstrain/cli/pathogens.py | 31 +++++++++++++++++++++---------- 5 files changed, 27 insertions(+), 16 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 65b4b136..481a9fad 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -18,7 +18,7 @@ development source code and as such may not be routinely kept up to date. * `nextstrain setup ` and `nextstrain version --pathogens` now list the available workflows for a pathogen if the pathogen lists the workflows in the top-level `nextstrain-pathogen.yaml` file. - ([#461](https://github.com/nextstrain/cli/pull/461)) + ([#461](https://github.com/nextstrain/cli/pull/461), [#472](https://github.com/nextstrain/cli/pull/472)) * Snakemake's storage support downloaded files (stored in `.snakemake/storage/`) are now downloaded from AWS Batch builds by default. diff --git a/doc/changes.md b/doc/changes.md index 1a009e3a..6c8a50ae 100644 --- a/doc/changes.md +++ b/doc/changes.md @@ -22,7 +22,7 @@ development source code and as such may not be routinely kept up to date. * `nextstrain setup ` and `nextstrain version --pathogens` now list the available workflows for a pathogen if the pathogen lists the workflows in the top-level `nextstrain-pathogen.yaml` file. 
- ([#461](https://github.com/nextstrain/cli/pull/461)) + ([#461](https://github.com/nextstrain/cli/pull/461), [#472](https://github.com/nextstrain/cli/pull/472)) * Snakemake's storage support downloaded files (stored in `.snakemake/storage/`) are now downloaded from AWS Batch builds by default. diff --git a/nextstrain/cli/command/run.py b/nextstrain/cli/command/run.py index d278fb95..3d5f7e8c 100644 --- a/nextstrain/cli/command/run.py +++ b/nextstrain/cli/command/run.py @@ -228,7 +228,7 @@ def run(opts): # Resolve pathogen and workflow names to a local workflow directory. pathogen = PathogenVersion(opts.pathogen) - if opts.workflow not in pathogen.registered_workflows(): + if opts.workflow not in pathogen.compatible_workflows("nextstrain run"): print(f"The {opts.workflow!r} workflow is not registered as a compatible workflow, but trying to run anyways.") workflow_directory = pathogen.workflow_path(opts.workflow) diff --git a/nextstrain/cli/command/version.py b/nextstrain/cli/command/version.py index 4ab8a925..1c0a8db0 100644 --- a/nextstrain/cli/command/version.py +++ b/nextstrain/cli/command/version.py @@ -72,9 +72,9 @@ def run(opts): if opts.verbose: print(" " + str(version.path)) - if registered_workflows := version.registered_workflows(): - print(" " + "Available workflows:") - for workflow in registered_workflows: + if compatible_workflows := version.compatible_workflows("nextstrain run"): + print(" " + "`nextstrain run` compatible workflows:") + for workflow in compatible_workflows: print(" " + workflow) else: print(" " + "No workflows listed, please refer to pathogen docs.") diff --git a/nextstrain/cli/pathogens.py b/nextstrain/cli/pathogens.py index 7afbdc49..a104a6d3 100644 --- a/nextstrain/cli/pathogens.py +++ b/nextstrain/cli/pathogens.py @@ -308,20 +308,32 @@ def __init__(self, name_version_url: str, new_setup: bool = False): def registered_workflows(self) -> Dict[str, Dict]: """ Parses :attr:`.registration` to return a dict of registered - compatible 
workflows, where the keys are workflow names. + workflows, where the keys are workflow names. """ if self.registration is None: debug("pathogen does not have a registration") return {} - workflows = self.registration.get("compatibility", {}).get("nextstrain run") + workflows = self.registration.get("workflows") if not isinstance(workflows, dict): - debug(f"pathogen registration.compatibility['nextstrain runs'] is not a dict (got a {type(workflows).__name__})") + debug(f"pathogen registration.workflows is not a dict (got a {type(workflows).__name__})") return {} return workflows + def compatible_workflows(self, feature: str) -> Dict[str, Dict]: + """ + Parses registered workflows to return a subset of workflows that are + compatible with the provided *feature*. + """ + return { + workflow: workflow_config + for workflow, workflow_config in self.registered_workflows().items() + if workflow_config.get("compatibility", {}).get(feature, False) + } + + def workflow_path(self, workflow: str) -> Path: return self.path / workflow @@ -481,6 +493,11 @@ def test_compatibility() -> SetupTestResult: if self.registration is None: return msg + "\n(couldn't read registration)", False + if compatible_workflows := self.compatible_workflows("nextstrain run"): + return msg + f"\nCompatible workflows: {list(compatible_workflows.keys())}", True + + # If no compatible workflows are listed, then check for the top level + # boolean compatibility declaration try: compatibility = self.registration["compatibility"]["nextstrain run"] except (KeyError, IndexError, TypeError): @@ -488,13 +505,7 @@ def test_compatibility() -> SetupTestResult: traceback.print_exc() return msg + "\n(couldn't find 'compatibility: nextstrain run: …' field)", False - if compatibility: - if workflows := self.registered_workflows(): - msg += f"\nAvailable workflows: {list(workflows.keys())}" - else: - msg += f"\nNo workflows listed, please refer to pathogen docs." 
- - return msg, bool(compatibility) + return msg + "\nNo compatible workflows listed, please refer to pathogen docs.", bool(compatibility) return [ ('downloaded', From 582163913461c29c8aa10d1928f5287175ca2d0b Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Mon, 21 Jul 2025 17:15:44 -0700 Subject: [PATCH 2/4] run: support registered workflow Snakefile and configfile Allow maintainers to define paths to each workflow's Snakefile and configfile in the nextstrain-pathogen.yaml. Using mpox as an example, this allows the named workflows to use the shared Snakefile and their own custom configfiles without needing to create separate Snakefiles for each workflow. ``` --- compatibility: nextstrain run: ingest: ~ phylogenetic/all-clades: snakefile: phylogenetic/Snakefile configfile: phylogenetic/defaults/mpxv/config.yaml phylogenetic/clade-I: snakefile: phylogenetic/Snakefile configfile: phylogenetic/defaults/clade-i/config.yaml phylogenetic/clade-IIb: snakefile: phylogenetic/Snakefile configfile: phylogenetic/defaults/hmpxv1/config.yaml phylogenetic/lineage-B.1: snakefile: phylogenetic/Snakefile configfile: phylogenetic/defaults/hmpxv1_big/config.yaml ``` --- nextstrain/cli/command/run.py | 33 ++++++++++++++++++++++++--------- nextstrain/cli/pathogens.py | 20 ++++++++++++++++++++ 2 files changed, 44 insertions(+), 9 deletions(-) diff --git a/nextstrain/cli/command/run.py b/nextstrain/cli/command/run.py index 3d5f7e8c..cb886382 100644 --- a/nextstrain/cli/command/run.py +++ b/nextstrain/cli/command/run.py @@ -231,27 +231,42 @@ def run(opts): if opts.workflow not in pathogen.compatible_workflows("nextstrain run"): print(f"The {opts.workflow!r} workflow is not registered as a compatible workflow, but trying to run anyways.") - workflow_directory = pathogen.workflow_path(opts.workflow) + workflow_files = pathogen.workflow_files(opts.workflow) + workflow_snakefile = workflow_files["snakefile"] - if not workflow_directory.is_dir() or not (workflow_directory / 
"Snakefile").is_file(): + if not workflow_snakefile.is_file(): raise UserError(f""" - No {opts.workflow!r} workflow for pathogen {opts.pathogen!r} found {f"in {str(workflow_directory)!r}" if DEBUGGING else "locally"}. + No {opts.workflow!r} workflow for pathogen {opts.pathogen!r} found {f"(Snakefile {workflow_snakefile!r} does not exist)" if DEBUGGING else "locally"}. Maybe you need to update to a newer version of the pathogen? Hint: to update the pathogen, run `nextstrain update {shquote(pathogen.name)}`. """) + if workflow_configfile := workflow_files["configfile"]: + assert workflow_configfile.is_file(), \ + f"Workflow's registered config file {workflow_configfile!r} does not exist." + # The pathogen volume is the pathogen directory (i.e. repo). - # The workflow volume is the workflow directory within the pathogen directory. # The build volume is the user's analysis directory and will be the working directory. - pathogen_volume, workflow_volume = build.pathogen_volumes(workflow_directory, name = "pathogen") + pathogen_volume, _ = build.pathogen_volumes(pathogen.path, name = "pathogen") build_volume = NamedVolume("build", opts.analysis_directory) # for containerized runtimes (e.g. Docker, Singularity, and AWS Batch) opts.volumes.append(pathogen_volume) opts.volumes.append(build_volume) + # Resolve paths for workflow files + resolved_pathogen = ( + docker.mount_point(pathogen_volume) + if opts.__runner__ in {docker, singularity, aws_batch} else + pathogen_volume.src.resolve(strict = True) + ) + resolved_snakefile = resolved_pathogen / workflow_snakefile.relative_to(pathogen.path) + resolved_configfile = None + if workflow_configfile: + resolved_configfile = resolved_pathogen / workflow_configfile.relative_to(pathogen.path) + print(f"Running the {opts.workflow!r} workflow for pathogen {pathogen}") # Set up Snakemake invocation. 
@@ -276,10 +291,10 @@ def run(opts): # Workdir will be the analysis volume (/nextstrain/build in a # containerized runtime), so explicitly point to the Snakefile. - "--snakefile=%s/Snakefile" % ( - docker.mount_point(workflow_volume) - if opts.__runner__ in {docker, singularity, aws_batch} else - workflow_volume.src.resolve(strict = True)), + "--snakefile=%s" % (resolved_snakefile), + + *(["--configfile=%s" % (resolved_configfile)] + if resolved_configfile else []), # Pass thru appropriate resource options. # diff --git a/nextstrain/cli/pathogens.py b/nextstrain/cli/pathogens.py index a104a6d3..d8fd5cb2 100644 --- a/nextstrain/cli/pathogens.py +++ b/nextstrain/cli/pathogens.py @@ -338,6 +338,26 @@ def workflow_path(self, workflow: str) -> Path: return self.path / workflow + def workflow_files(self, workflow: str) -> Dict: + """ + Parses :attr:`.registration` to get the path to a *workflow* files, + snakefile and configfile. + """ + files = { + "snakefile": self.workflow_path(workflow) / "Snakefile", + "configfile": None, + } + + if workflow_registration := self.registered_workflows().get(workflow): + if snakefile := workflow_registration.get("snakefile"): + files["snakefile"] = self.path / snakefile + + if configfile := workflow_registration.get("configfile"): + files["configfile"] = self.path / configfile + + return files + + def setup(self, dry_run: bool = False, force: bool = False) -> SetupStatus: """ Downloads and installs this pathogen version from :attr:`.url`. From 38ecf54cd849b919be377958d2a10dab18164bc6 Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Thu, 24 Jul 2025 13:41:15 -0700 Subject: [PATCH 3/4] run: ensure /config.yaml overrides workflow config Because of the order in which Snakemake merges configs, we must also provide the user's config from their analysis directory to override workflow configs provided via the `--configfile` option. 
See detailed discussion in --- nextstrain/cli/command/run.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/nextstrain/cli/command/run.py b/nextstrain/cli/command/run.py index cb886382..fe6e7e17 100644 --- a/nextstrain/cli/command/run.py +++ b/nextstrain/cli/command/run.py @@ -267,6 +267,15 @@ def run(opts): if workflow_configfile: resolved_configfile = resolved_pathogen / workflow_configfile.relative_to(pathogen.path) + resolved_overlay = None + if (opts.analysis_directory / "config.yaml").is_file(): + resolved_build = ( + docker.mount_point(build_volume) + if opts.__runner__ in {docker, singularity, aws_batch} else + build_volume.src.resolve(strict = True) + ) + resolved_overlay = resolved_build / "config.yaml" + print(f"Running the {opts.workflow!r} workflow for pathogen {pathogen}") # Set up Snakemake invocation. @@ -296,6 +305,10 @@ def run(opts): *(["--configfile=%s" % (resolved_configfile)] if resolved_configfile else []), + # Ensure the overlay config in the user's analysis directory + # overrides any default config file provided above. + *(["--configfile=%s" % (resolved_overlay)] + if resolved_overlay else []), # Pass thru appropriate resource options. # # Snakemake requires the --cores option as of 5.11, so provide a From 1e4a963f5aac3079d40ab03bdaea65eeb5250e6e Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Thu, 24 Jul 2025 13:41:15 -0700 Subject: [PATCH 4/4] Update changelog --- CHANGES.md | 5 +++++ doc/changes.md | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/CHANGES.md b/CHANGES.md index 481a9fad..8017a72f 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -15,6 +15,11 @@ development source code and as such may not be routinely kept up to date. ## Improvements +* `nextstrain run` supports running workflows with defined Snakefiles and + configfiles in the `nextstrain-pathogen.yaml` file. This is mainly relevant + for maintainers for pathogens and does not affect users of `nextstrain run`. 
+ ([#462](https://github.com/nextstrain/cli/pull/462)) + * `nextstrain setup ` and `nextstrain version --pathogens` now list the available workflows for a pathogen if the pathogen lists the workflows in the top-level `nextstrain-pathogen.yaml` file. diff --git a/doc/changes.md b/doc/changes.md index 6c8a50ae..c9e2a839 100644 --- a/doc/changes.md +++ b/doc/changes.md @@ -19,6 +19,11 @@ development source code and as such may not be routinely kept up to date. (v-next-improvements)= ### Improvements +* `nextstrain run` supports running workflows with defined Snakefiles and + configfiles in the `nextstrain-pathogen.yaml` file. This is mainly relevant + for maintainers for pathogens and does not affect users of `nextstrain run`. + ([#462](https://github.com/nextstrain/cli/pull/462)) + * `nextstrain setup ` and `nextstrain version --pathogens` now list the available workflows for a pathogen if the pathogen lists the workflows in the top-level `nextstrain-pathogen.yaml` file.