nf-core · SkyLexS · Dec 12, 2025 · Mar 3, 2026 · Apr 13, 2026 · Apr 13, 2026
diff --git a/modules/nf-core/bigslice/prep_input/environment.yml b/modules/nf-core/bigslice/prep_input/environment.yml
@@ -0,0 +1,9 @@
+---
+# Conda environment for BIGSLICE_PREP_INPUT; the bigslice pin matches the 2.0.2 container referenced in main.nf.
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - bioconda::bigslice=2.0.2
+  - conda-forge::biopython=1.81
+  - conda-forge::pandas=2.0.3
diff --git a/modules/nf-core/bigslice/prep_input/main.nf b/modules/nf-core/bigslice/prep_input/main.nf
@@ -0,0 +1,92 @@
+/*
+ * BIGSLICE_PREP_INPUT: Prepares antiSMASH outputs for BiG-SLiCE analysis
+ *
+ * This process transforms antiSMASH output directories into the specific
+ * directory structure and file format that BiG-SLiCE expects for clustering
+ * biosynthetic gene clusters (BGCs).
+ *
+ * BiG-SLiCE input structure:
+ * input/
+ * ├── datasets.tsv              # dataset configuration file
+ * ├── <dataset_name>/           # GBK files organized by sample (each GBK contains a BGC region)
+ * │   ├── sample1/
+ * │   │   ├── region001.gbk     # BGC region 1 in GenBank format
+ * │   │   └── region002.gbk     # BGC region 2 in GenBank format
+ * │   └── sample2/
+ * │       └── region001.gbk     # BGC region 1 in GenBank format
+ * └── taxonomy/
+ *     └── dataset_taxonomy.tsv  # taxonomic information (9-column format)
+ *
+ * Pipeline parameters read by the script:
+ *   params.bgc_bigslice_dataset_name  name of the dataset folder and of the datasets.tsv entry
+ *   params.bgc_bigslice_taxonomy      optional taxonomy TSV; an "Unknown" placeholder table is written when unset
+ */
+process BIGSLICE_PREP_INPUT {
+  tag "${meta.id}"
+  // labels are static identifiers (no interpolation), so per-sample information goes in the tag above
+  label 'bigslice'
+
+  conda "${moduleDir}/environment.yml"
+  container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+    'https://depot.galaxyproject.org/singularity/bigslice:2.0.2--pyh8ed023e_0':
+    'quay.io/biocontainers/bigslice:2.0.2--pyh8ed023e_0' }"
+
+  input:
+  // meta: metadata map (meta.id tags the task); antismash_dirs: antiSMASH output directories (one per sample)
+  tuple val(meta), path(antismash_dirs)
+
+  output:
+  // complete "input" folder structure for BiG-SLiCE (contains dataset/, taxonomy/, datasets.tsv)
+  path "input", emit: input_dir
+  path "versions.yml", emit: versions
+
+  script:
+  // prepare quoted directory list for bash for-loop processing
+  def quoted = antismash_dirs.collect { "\"${it}\"" }.join(' ')
+  // fallback version written to versions.yml when 'bigslice --version' output cannot be parsed
+  def VERSION = '2.0.2'
+  """
+  set -euo pipefail
+
+  ROOT="input"                              # BiG-SLiCE input root directory
+  DS="${params.bgc_bigslice_dataset_name}"      # dataset name (e.g., 'antismash')
+  OUT="\$ROOT/\$DS"                         # dataset-specific output directory
+  TAXROOT="\$ROOT/taxonomy"                 # taxonomy directory
+
+  rm -rf "\$ROOT"
+  mkdir -p "\$OUT" "\$TAXROOT"
+
+  for d in ${quoted}; do
+    [ -d "\$d" ] || continue                # skip if directory doesn't exist
+    sample=\$(basename "\$d")               # extract sample name from directory path
+    mkdir -p "\$OUT/\$sample"               # create sample-specific subdirectory
+
+    find -L "\$d" -type f \\( -name "*.region*.gbk" -o -name "*.gbk" \\) -print0 \
+      | xargs -0 -I{} cp -f "{}" "\$OUT/\$sample/"
+  done
+
+  # use the configured taxonomy table when one is set, otherwise write one "Unknown" row per sample
+  if [ -n "${params.bgc_bigslice_taxonomy ?: ''}" ]; then
+    cp "${params.bgc_bigslice_taxonomy}" "\$TAXROOT/dataset_taxonomy.tsv"
+  else
+    printf "accession\\ttaxdomain\\tphylum\\tclass\\torder\\tfamily\\tgenus\\tspecies\\torganism\\n" > "\$TAXROOT/dataset_taxonomy.tsv"
+
+    for d in "\$OUT"/*/; do
+      [ -d "\$d" ] || continue
+      acc=\$(basename "\$d")/                # sample accession (with trailing slash as per BiG-SLiCE format)
+      printf "%s\\tUnknown\\tUnknown\\tUnknown\\tUnknown\\tUnknown\\tUnknown\\tUnknown\\tUnknown\\n" "\$acc" >> "\$TAXROOT/dataset_taxonomy.tsv"
+    done
+  fi
+
+  # printf rather than echo: bash's builtin echo leaves tab escapes unexpanded, which corrupted the header line
+  {
+    printf "# dataset_name\\tdataset_path\\ttaxonomy_path\\tdescription\\n"
+    printf "%s\\t%s\\t%s\\t%s\\n" "\$DS" "\$DS" "taxonomy/dataset_taxonomy.tsv" "antiSMASH \$DS"
+  } > "\$ROOT/datasets.tsv"
+
+  cat <<-END_VERSIONS > versions.yml
+  "${task.process}":
+      bigslice: \$(bigslice --version 2>&1 | grep -oP 'BiG-SLiCE \\K[0-9.]+' || echo "${VERSION}")
+  END_VERSIONS
+  """
+}
diff --git a/modules/nf-core/bigslice/prep_input/meta.yml b/modules/nf-core/bigslice/prep_input/meta.yml
@@ -0,0 +1,46 @@
+name: bigslice_prep_input
+description: Prepares antiSMASH outputs for BiG-SLiCE clustering analysis by organizing BGC GenBank files into the required directory structure
+keywords:
+  - biosynthetic gene cluster
+  - BGC
+  - antismash
+  - bigslice
+  - data preparation
+  - genome mining
+
+tools:
+  - bigslice:
+      description: A highly scalable, user-interactive tool for the large scale analysis of Biosynthetic Gene Clusters data
+      homepage: https://github.com/medema-group/BiG-SLiCE
+      documentation: https://github.com/medema-group/BiG-SLiCE/wiki
+      tool_dev_url: https://github.com/medema-group/BiG-SLiCE
+      doi: "10.1038/s41467-021-21428-1"
+      licence: ["AGPL v3"]
+
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing dataset information; `id` is used as the task tag
+        e.g. [ id:'antismash' ]
+  - antismash_dirs:
+      type: directory
+      description: |
+        List of antiSMASH output directories, one per sample.
+        Each directory should contain *.region*.gbk files representing
+        detected biosynthetic gene clusters.
+      pattern: "*/"
+
+output:
+  - input_dir:
+      type: directory
+      description: |
+        Structured input directory for BiG-SLiCE containing:
+        - datasets.tsv: dataset configuration file
+        - <dataset_name>/: organized BGC GenBank files by sample
+        - taxonomy/: dataset_taxonomy.tsv, the 9-column taxonomy table
+      pattern: "input/"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
diff --git a/modules/nf-core/bigslice/run/environment.yml b/modules/nf-core/bigslice/run/environment.yml
@@ -0,0 +1,13 @@
+---
+# Conda environment for BIGSLICE_RUN; the bigslice pin matches the 2.0.2 container referenced in main.nf.
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - bioconda::bigslice=2.0.2
+  - conda-forge::scikit-learn=1.3.0
+  - conda-forge::numpy=1.24.3
+  - conda-forge::pandas=2.0.3
+  - conda-forge::biopython=1.81
+  - conda-forge::matplotlib-base=3.7.2
+  - conda-forge::flask=2.3.3
diff --git a/modules/nf-core/bigslice/run/main.nf b/modules/nf-core/bigslice/run/main.nf
@@ -0,0 +1,64 @@
+/*
+ * BIGSLICE_RUN: Executes BiG-SLiCE clustering analysis on prepared BGC data
+ *
+ * BiG-SLiCE (Biosynthetic Gene cluster - Super Linear Clustering Engine) is a tool
+ * for rapid clustering and analysis of biosynthetic gene clusters from genomic data.
+ *
+ * This process:
+ * 1. Takes the structured input directory created by BIGSLICE_PREP_INPUT
+ * 2. Uses pre-trained machine learning models to analyze BGC features
+ * 3. Performs hierarchical clustering of BGCs based on genetic similarity
+ * 4. Generates interactive web application and detailed clustering results
+ *
+ * Output structure:
+ * output/
+ * ├── app/                           # interactive web application for visualization
+ * ├── result/                        # clustering results and analysis data
+ * ├── LICENSE.txt                    # BiG-SLiCE license information
+ * ├── requirements.txt               # Python dependencies for the web app
+ * └── start_server.sh               # script to launch the interactive web server
+ */
+process BIGSLICE_RUN {
+  label 'bigslice'
+
+  conda "${moduleDir}/environment.yml"
+  container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+    'https://depot.galaxyproject.org/singularity/bigslice:2.0.2--pyh8ed023e_0':
+    'quay.io/biocontainers/bigslice:2.0.2--pyh8ed023e_0' }"
+
+  input:
+  path input_dir    // structured input directory from BIGSLICE_PREP_INPUT (contains datasets.tsv, BGC files, taxonomy)
+  path models_dir   // pre-trained BiG-SLiCE models directory (e.g., bigslice-models.2022-11-30)
+
+  output:
+  path "output", emit: outdir   // complete BiG-SLiCE output directory with clustering results and reports
+  path "versions.yml", emit: versions
+
+  script:
+  // optional extra command-line flags supplied through the process configuration (empty when unset)
+  def args = task.ext.args ?: ''
+  // fallback version written to versions.yml when 'bigslice --version' output cannot be parsed
+  def VERSION = '2.0.2'
+  """
+  set -euo pipefail
+
+  # clean any existing output directory to ensure fresh results
+  rm -rf output 2>/dev/null || true
+
+  # execute BiG-SLiCE clustering analysis
+  # -i: input directory containing prepared BGC data and configuration
+  # --program_db_folder: directory with pre-trained machine learning models
+  # ext.args: optional extra flags from the process configuration
+  # output: destination directory for all results and reports
+  bigslice \
+    -i "${input_dir}" \
+    --program_db_folder "${models_dir}" \
+    ${args} \
+    output
+
+  cat <<-END_VERSIONS > versions.yml
+  "${task.process}":
+      bigslice: \$(bigslice --version 2>&1 | grep -oP 'BiG-SLiCE \\K[0-9.]+' || echo "${VERSION}")
+  END_VERSIONS
+  """
+}
diff --git a/modules/nf-core/bigslice/run/meta.yml b/modules/nf-core/bigslice/run/meta.yml
@@ -0,0 +1,58 @@
+name: bigslice_run
+description: >-
+  Executes BiG-SLiCE clustering analysis on prepared BGC data using
+  pre-trained machine learning models
+keywords:
+  - biosynthetic gene cluster
+  - BGC
+  - clustering
+  - machine learning
+  - comparative genomics
+  - secondary metabolites
+  - bigslice
+
+# Upstream software wrapped by this module
+tools:
+  - bigslice:
+      description: >-
+        A highly scalable, user-interactive tool for the large scale analysis
+        of Biosynthetic Gene Clusters data
+      homepage: https://github.com/medema-group/BiG-SLiCE
+      documentation: https://github.com/medema-group/BiG-SLiCE/wiki
+      tool_dev_url: https://github.com/medema-group/BiG-SLiCE
+      doi: "10.1038/s41467-021-21428-1"
+      licence:
+        - "AGPL v3"
+
+# Inputs of BIGSLICE_RUN, in declaration order (both are directories)
+input:
+  - input_dir:
+      type: directory
+      description: |
+        Structured input directory created by BIGSLICE_PREP_INPUT containing:
+        - datasets.tsv: dataset configuration
+        - BGC GenBank files organized by sample
+        - taxonomy information
+      pattern: "input/"
+  - models_dir:
+      type: directory
+      description: |
+        BiG-SLiCE pre-trained models directory.
+        Download from: https://github.com/medema-group/BiG-SLiCE/releases
+        Example: bigslice-models.2022-11-30.tar.gz
+      pattern: "*/"
+
+# Outputs emitted by BIGSLICE_RUN
+output:
+  - outdir:
+      type: directory
+      description: |
+        Complete BiG-SLiCE output directory containing:
+        - result/: clustering results and analysis data
+        - app/: interactive web application for visualization
+        - start_server.sh: script to launch web interface
+      pattern: "output/"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"