Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,15 @@ development source code and as such may not be routinely kept up to date.

## Improvements

* `nextstrain run` supports running workflows with defined Snakefiles and
configfiles in the `nextstrain-pathogen.yaml` file. This is mainly relevant
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(non-blocking) There's no documentation for the nextstrain-pathogen.yaml schema yet, but it might be good to add soon.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For sure, I was going to add the schema next. I'm thinking I'll stick with jsonschema that we already use for Augur.

for maintainers for pathogens and does not affect users of `nextstrain run`.
([#462](https://github.com/nextstrain/cli/pull/462))

* `nextstrain setup <pathogen>` and `nextstrain version --pathogens` now list
the available workflows for a pathogen if the pathogen lists the workflows
in the top-level `nextstrain-pathogen.yaml` file.
([#461](https://github.com/nextstrain/cli/pull/461))
([#461](https://github.com/nextstrain/cli/pull/461), [#472](https://github.com/nextstrain/cli/pull/472))

* Snakemake's storage support downloaded files (stored in `.snakemake/storage/`)
are now downloaded from AWS Batch builds by default.
Expand Down
7 changes: 6 additions & 1 deletion doc/changes.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,15 @@ development source code and as such may not be routinely kept up to date.
(v-next-improvements)=
### Improvements

* `nextstrain run` supports running workflows with defined Snakefiles and
configfiles in the `nextstrain-pathogen.yaml` file. This is mainly relevant
for maintainers for pathogens and does not affect users of `nextstrain run`.
([#462](https://github.com/nextstrain/cli/pull/462))

* `nextstrain setup <pathogen>` and `nextstrain version --pathogens` now list
the available workflows for a pathogen if the pathogen lists the workflows
in the top-level `nextstrain-pathogen.yaml` file.
([#461](https://github.com/nextstrain/cli/pull/461))
([#461](https://github.com/nextstrain/cli/pull/461), [#472](https://github.com/nextstrain/cli/pull/472))

* Snakemake's storage support downloaded files (stored in `.snakemake/storage/`)
are now downloaded from AWS Batch builds by default.
Expand Down
48 changes: 38 additions & 10 deletions nextstrain/cli/command/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,30 +228,54 @@ def run(opts):
# Resolve pathogen and workflow names to a local workflow directory.
pathogen = PathogenVersion(opts.pathogen)

if opts.workflow not in pathogen.registered_workflows():
if opts.workflow not in pathogen.compatible_workflows("nextstrain run"):
print(f"The {opts.workflow!r} workflow is not registered as a compatible workflow, but trying to run anyways.")

workflow_directory = pathogen.workflow_path(opts.workflow)
workflow_files = pathogen.workflow_files(opts.workflow)
workflow_snakefile = workflow_files["snakefile"]

if not workflow_directory.is_dir() or not (workflow_directory / "Snakefile").is_file():
if not workflow_snakefile.is_file():
raise UserError(f"""
No {opts.workflow!r} workflow for pathogen {opts.pathogen!r} found {f"in {str(workflow_directory)!r}" if DEBUGGING else "locally"}.
No {opts.workflow!r} workflow for pathogen {opts.pathogen!r} found {f"(Snakefile {workflow_snakefile!r} does not exist)" if DEBUGGING else "locally"}.

Maybe you need to update to a newer version of the pathogen?

Hint: to update the pathogen, run `nextstrain update {shquote(pathogen.name)}`.
""")

if workflow_configfile := workflow_files["configfile"]:
assert workflow_configfile.is_file(), \
f"Workflow's registered config file {workflow_configfile!r} does not exist."

# The pathogen volume is the pathogen directory (i.e. repo).
# The workflow volume is the workflow directory within the pathogen directory.
# The build volume is the user's analysis directory and will be the working directory.
pathogen_volume, workflow_volume = build.pathogen_volumes(workflow_directory, name = "pathogen")
pathogen_volume, _ = build.pathogen_volumes(pathogen.path, name = "pathogen")
build_volume = NamedVolume("build", opts.analysis_directory)

# for containerized runtimes (e.g. Docker, Singularity, and AWS Batch)
opts.volumes.append(pathogen_volume)
opts.volumes.append(build_volume)

# Resolve paths for workflow files
resolved_pathogen = (
docker.mount_point(pathogen_volume)
if opts.__runner__ in {docker, singularity, aws_batch} else
pathogen_volume.src.resolve(strict = True)
)
resolved_snakefile = resolved_pathogen / workflow_snakefile.relative_to(pathogen.path)
resolved_configfile = None
if workflow_configfile:
resolved_configfile = resolved_pathogen / workflow_configfile.relative_to(pathogen.path)

resolved_overlay = None
if (opts.analysis_directory / "config.yaml").is_file():
resolved_build = (
docker.mount_point(build_volume)
if opts.__runner__ in {docker, singularity, aws_batch} else
build_volume.src.resolve(strict = True)
)
resolved_overlay = resolved_build / "config.yaml"

print(f"Running the {opts.workflow!r} workflow for pathogen {pathogen}")

# Set up Snakemake invocation.
Expand All @@ -276,11 +300,15 @@ def run(opts):

# Workdir will be the analysis volume (/nextstrain/build in a
# containerized runtime), so explicitly point to the Snakefile.
"--snakefile=%s/Snakefile" % (
docker.mount_point(workflow_volume)
if opts.__runner__ in {docker, singularity, aws_batch} else
workflow_volume.src.resolve(strict = True)),
"--snakefile=%s" % (resolved_snakefile),

*(["--configfile=%s" % (resolved_configfile)]
if resolved_configfile else []),
Comment on lines +305 to +306
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OH, just realized this overrides the custom config in the analysis directory...will think on this...

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah I remember it took me a bit of digging figure out snakemake's config handling. I was alluding to this here, but a much more thorough comment is this one from the original avian-flu PR. It all makes sense once you figure it out, but it's confusing before that because configfile: does more than you (I!) thought it does.

Copy link
Member

@jameshadfield jameshadfield Jul 24, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Put another way, I think --configfile base overlay or

configfile: base
configfile: overlay

or configfile: base + --configfile overlay will all work; thinking through how the third example actually works, and what that says about what configfile: is doing behind the scenes was the 💡 moment for me. (The second example is how we run external analysis directories with a default config.yaml overlay, and the third is the documented way to run external analysis directories with a differently-named overlay YAML.) But --configfile base plus configfile: overlay doesn't work, and I don't think it can be made to work in a nice way. (I think that's what this PR is trying to do, but maybe not?)

As a thought experiment, I think the runner (via nextstrain-pathogen.yaml) could define --config base_configfile: base and then in the snakefile we leverage that to do something like

if 'base_configfile' in config:
  configfile: path.join(repo_directory, config['base_configfile' ])

but I'm sure there's a nicer way.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah thanks for the pointers @jameshadfield!

I was reading through the Snakemake config docs again and noticed that there is also an API method for overwriting configs (snakemake.utils.update_config). As another experiment, I thought we could replace how we incorporate the overlay config:

diff --git a/phylogenetic/Snakefile b/phylogenetic/Snakefile
index 57a9da9474..399e6eef9f 100644
--- a/phylogenetic/Snakefile
+++ b/phylogenetic/Snakefile
@@ -16,7 +16,9 @@
 
 # Use custom configuration from analysis directory (i.e. working dir), if any.
 if os.path.exists("config.yaml"):
-    configfile: "config.yaml"
+    with open("config.yaml", "r", encoding = "utf-8") as f:
+        custom_config = yaml.safe_load(f)
+        snakemake.utils.update_config(config, custom_config)

Unfortunately, this means the overlay also overrides any --config, which is not an issue for nextstrain run, but is unexpected behavior for directly running snakemake or nextstrain build...

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As a thought experiment, I think the runner (via nextstrain-pathogen.yaml) could define --config base_configfile: base and then in the snakefile we leverage that to do something like

if 'base_configfile' in config:
  configfile: path.join(repo_directory, config['base_configfile' ])

Tried this out locally and it works as expected if it's defined before configfile: overlay. So for our workflows the config merging order will be

--config base_configfile=... < configfile: overlay < --configfile < --config

Not sure how I feel about that special case for base_configfile yet...

Copy link
Member

@jameshadfield jameshadfield Jul 25, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, that seems... fine. I think! It's not nextstrain run specific per-se as you can run via snakemake --configfile base overlay pretty easily.

And this should work nicely with custom-named config overlays as you can add --configfile overlay.yaml (if nextstrain run allows you to add your own snakemake args?!?)

Would you follow this up with the removal of lines such as these in measles?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's not nextstrain run specific per-se as you can run via snakemake --configfile base overlay pretty easily.

Ah, I meant implemented specifically for nextstrain run here since this change shouldn't affect how workflows run for snakemake or nextstrain build calls.

And this should work nicely with custom-named config overlays as you can add --configfile overlay.yaml (if nextstrain run allows you to add your own snakemake args?!?)

As of right now, nextstrain run does not allow you to pass in additional snakemake CLI args (maybe it would work with a default Snakemake profile?)

Would you follow this up with the removal of lines such as these in measles?

I think so? Unless we expect the config.yaml in the analysis directory to get automatically used when running with snakemake or nextstrain build.

Copy link
Member

@jameshadfield jameshadfield Jul 27, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unless we expect the config.yaml in the analysis directory to get automatically used when running with snakemake or nextstrain build.

We have two different scenarios:

  1. There's clearly a default configfile (e.g. measles, avian-flu)
  2. There's no one default, you have to select it somehow (e.g. mpox)

Outside of nextstrain run I'm not sure of the best behaviour for (1) - should you be forced to supply the default --configfile yourself, or can the Snakefile (or config.smk etc) do that for you as it currently does. (Note that you already have to supply --snakefile ....) I guess consistency across (1) and (2) would be nice.

Copy link
Contributor Author

@joverlee521 joverlee521 Jul 29, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, probably best to write out how all of the different methods of running the workflow will look...

For (1) - a pathogen that does have a Snakemake code to load default config and automatically use the analysis config overlay such as measles, the commands are pretty simple:

snakemake \
    --snakefile phylogenetic/Snakefile \
    --directory /tmp/measles-phylo
# Hypothetical custom build config within phylogenetic/build-configs/overlay
nextstrain build phylogenetic \
    --directory build-configs/overlay/ \

The nextstrain-pathogen.yaml will be simple for nextstrain run since it does not need to define the snakefile or config files:

---
compatibility:
  nextstrain run:
    ingest: ~
    phylogenetic: ~
    nextclade: ~

For (2) - a pathogen without a default config, the automatic use of analysis config overlay does not work as expected. There will be no Snakemake code to define configfile, so we must always specify both the base config and overlay via the --configfile option:

snakemake \
    --snakefile phylogenetic/Snakefile \
    --directory /tmp/mpox-phylo-clade-i \
    --configfile phylogenetic/defaults/clade-i/config.yaml \
    --configfile /tmp/mpox-phylo-clade-i/config.yaml
# Hypothetical custom build config within phylogenetic/build-configs/overlay
nextstrain build phylogenetic \
    --directory build-configs/overlay/ \
    --configfile defaults/clade-i/config.yaml \
    --configfile build-configs/overlay/config.yaml

The nextstrain-pathogen.yaml can will define the snakefile and configfiles for nextstrain run:

---
compatibility:
  nextstrain run:
    ingest: ~
    phylogenetic/all-clades:
        snakefile: phylogenetic/Snakefile
        configfile: phylogenetic/defaults/mpxv/config.yaml
    phylogenetic/clade-I:
        snakefile: phylogenetic/Snakefile
        configfile: phylogenetic/defaults/clade-i/config.yaml
    phylogenetic/clade-IIb:
        snakefile: phylogenetic/Snakefile
        configfile: phylogenetic/defaults/hmpxv1/config.yaml
    phylogenetic/lineage-B.1:
        snakefile: phylogenetic/Snakefile
        configfile: phylogenetic/defaults/hmpxv1_big/config.yaml

If we remove the default base config and automatic use of analysis config overlays, then the commands for measles would be more verbose and consistent with mpox. However, after writing this out, I'm not sure that consistency is worth it, especially since the --configfile for the base config will always be the same for measles...

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks! Can you add to that comment what the nextstrain-pathogen.yaml would look like and if the workflow uses snakemake code to load the default/config overlays. I'm guessing measles has a YAML which doesn't define any configfiles and does have snakemake code like this, and (2) is the opposite?


# Ensure the overlay config in the user's analysis directory
# overrides any default config file provided above.
*(["--configfile=%s" % (resolved_overlay)]
if resolved_overlay else []),
# Pass thru appropriate resource options.
#
# Snakemake requires the --cores option as of 5.11, so provide a
Expand Down
6 changes: 3 additions & 3 deletions nextstrain/cli/command/version.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,9 +72,9 @@ def run(opts):
if opts.verbose:
print(" " + str(version.path))

if registered_workflows := version.registered_workflows():
print(" " + "Available workflows:")
for workflow in registered_workflows:
if compatible_workflows := version.compatible_workflows("nextstrain run"):
print(" " + "`nextstrain run` compatible workflows:")
for workflow in compatible_workflows:
print(" " + workflow)
else:
print(" " + "No workflows listed, please refer to pathogen docs.")
Expand Down
51 changes: 41 additions & 10 deletions nextstrain/cli/pathogens.py
Original file line number Diff line number Diff line change
Expand Up @@ -308,24 +308,56 @@ def __init__(self, name_version_url: str, new_setup: bool = False):
def registered_workflows(self) -> Dict[str, Dict]:
"""
Parses :attr:`.registration` to return a dict of registered
compatible workflows, where the keys are workflow names.
workflows, where the keys are workflow names.
"""
if self.registration is None:
debug("pathogen does not have a registration")
return {}

workflows = self.registration.get("compatibility", {}).get("nextstrain run")
workflows = self.registration.get("workflows")
if not isinstance(workflows, dict):
debug(f"pathogen registration.compatibility['nextstrain runs'] is not a dict (got a {type(workflows).__name__})")
debug(f"pathogen registration.workflows is not a dict (got a {type(workflows).__name__})")
return {}

return workflows


def compatible_workflows(self, feature: str) -> Dict[str, Dict]:
"""
Parses registered workflows to return a subset of workflows that are
compatible with the provided *feature*.
"""
return {
workflow: workflow_config
for workflow, workflow_config in self.registered_workflows().items()
if workflow_config.get("compatibility", {}).get(feature, False)
}


def workflow_path(self, workflow: str) -> Path:
return self.path / workflow


def workflow_files(self, workflow: str) -> Dict:
"""
Parses :attr:`.registration` to get the path to a *workflow* files,
snakefile and configfile.
"""
files = {
"snakefile": self.workflow_path(workflow) / "Snakefile",
"configfile": None,
}

if workflow_registration := self.registered_workflows().get(workflow):
if snakefile := workflow_registration.get("snakefile"):
files["snakefile"] = self.path / snakefile

if configfile := workflow_registration.get("configfile"):
files["configfile"] = self.path / configfile

return files


def setup(self, dry_run: bool = False, force: bool = False) -> SetupStatus:
"""
Downloads and installs this pathogen version from :attr:`.url`.
Expand Down Expand Up @@ -481,20 +513,19 @@ def test_compatibility() -> SetupTestResult:
if self.registration is None:
return msg + "\n(couldn't read registration)", False

if compatible_workflows := self.compatible_workflows("nextstrain run"):
return msg + f"\nCompatible workflows: {list(compatible_workflows.keys())}", True

# If no compatible workflows are listed, then check for the top level
# boolean compatibility declaration
try:
compatibility = self.registration["compatibility"]["nextstrain run"]
except (KeyError, IndexError, TypeError):
if DEBUGGING:
traceback.print_exc()
return msg + "\n(couldn't find 'compatibility: nextstrain run: …' field)", False

if compatibility:
if workflows := self.registered_workflows():
msg += f"\nAvailable workflows: {list(workflows.keys())}"
else:
msg += f"\nNo workflows listed, please refer to pathogen docs."

return msg, bool(compatibility)
return msg + "\nNo compatible workflows listed, please refer to pathogen docs.", bool(compatibility)

return [
('downloaded',
Expand Down
Loading