
Commit

Merge pull request #4 from phac-nml/dev
Dev
mattheww95 authored Oct 20, 2023
2 parents c73dcb9 + a8d3e68 commit 7747dbb
Showing 13 changed files with 53 additions and 14 deletions.
4 changes: 2 additions & 2 deletions README.md
@@ -21,7 +21,7 @@ Mikrokondo takes in either, Illumina, Nanopore or Pacbio data (Pacbio data only
Nextflow is required to run mikrokondo, but fortunately it is not too hard to install (Linux is required). The instructions for installing Nextflow can be found at either resource: [Nextflow Home](https://www.nextflow.io/) or [Nextflow Documentation](https://www.nextflow.io/docs/latest/getstarted.html#installation)

## Container Engine
-Nextflow and Mikrokondo only supports running the pipeline using containers such as: Docker, Singularity (now apptainer), podman, gitpod, sifter and charliecloud. Currently only usage with Singularity has been tested, but support for each of the container services exists. Note: Singularity was adopted by the Linux Foundation and is now called Apptainer. Singularity still exists, but it is likely newer installs will use Apptainer.
+Nextflow and Mikrokondo only support running the pipeline using containers such as Docker, Singularity (now Apptainer), Podman, Gitpod, Shifter and Charliecloud. Currently only usage with Singularity has been fully tested (Docker and Apptainer have only been partially tested), but support for each of the container services exists. Note: Singularity was adopted by the Linux Foundation and is now called Apptainer. Singularity still exists, but it is likely newer installs will use Apptainer.

## Docker or Singularity?
Docker requires root privileges, which can make it a hassle to install on computing clusters (there are workarounds). Apptainer/Singularity does not, so Apptainer/Singularity is the recommended way to run the pipeline.
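For reference, a typical containerized run might look like the sketch below; the `-profile singularity` flag follows common Nextflow pipeline conventions, and the `--input`/`--outdir` parameters are assumptions rather than options confirmed on this page.

```bash
# Sketch only: profile and parameter names are assumed, not confirmed here
nextflow run phac-nml/mikrokondo \
    -profile singularity \
    --input samplesheet.csv \
    --outdir results
```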
@@ -52,7 +52,7 @@ Under the usage section you can find example commands, instructions for configur
- [Decontamination Index](https://zenodo.org/record/8408557): Required for decontamination of reads (it is simply a minimap2 index)
- [Kraken2 nt database](https://benlangmead.github.io/aws-indexes/k2): Required for binning of metagenomic data; an alternative to using Mash for speciation
- [Bakta database](https://zenodo.org/record/7669534): Running Bakta is optional and there is a light database option; however, the full one is recommended. You will have to unzip and untar the database before use.
-- [StarAMR database](https://github.com/phac-nml/staramr#database-build): Running StarAMR is optional and requires downloading the StarAMR databases. Downloading the StarAMR databases requires the above command to run.
+- [StarAMR database](https://github.com/phac-nml/staramr#database-build): Running StarAMR is optional and requires downloading the StarAMR databases. If you wish to avoid downloading the database, the StarAMR container includes one, which mikrokondo will default to using if none is specified.

### Fields to update with resources
Paths to the above downloadable resources must be set in your `nextflow.config`. The fields to update in the params section of the `nextflow.config` are listed below:
3 changes: 2 additions & 1 deletion bin/kraken2_tophit.py
@@ -62,7 +62,8 @@ def __init__(self, report, taxa_level) -> None:
sys.stderr.write(f"Could not find taxa level {self.taxa_level} in output\n")
sys.exit(-1)
self.top_hit = self.select_top_hit(list(self.selected_taxa))
sys.stdout.write(f"{self.top_hit.SciName.replace('"', '')}")
output = self.top_hit.SciName.replace('"', '')
sys.stdout.write(f"{output}")

def select_top_hit(self, taxa_row: list):
"""Pick the top hit of the selected data
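The change above works around a Python quirk: before Python 3.12, an f-string expression cannot contain the quote character that delimits the string itself, so the old one-line version is a SyntaxError on the interpreters this pipeline targets. A minimal sketch of the problem and the fix, using a made-up `name` value:

```python
name = 'Escherichia "coli"'  # hypothetical stand-in for self.top_hit.SciName

# SyntaxError before Python 3.12: the inner " terminates the f-string early
# print(f"{name.replace('"', '')}")

# Computing the value first, as the commit does, works on any Python 3 version
output = name.replace('"', '')
print(output)  # Escherichia coli
```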
23 changes: 15 additions & 8 deletions conf/modules.config
@@ -53,14 +53,15 @@ process {
}

withName: REPORT_TO_TSV{
-        executor = 'local'
-        cache = 'false' // Resume does not work on module, if enabled a warning is thrown
-        errorStrategy = "terminate"
-        publishDir = [
-            mode: params.publish_dir_mode,
-            path: "${params.outdir}/SummaryReport",
-            pattern: "final_report.tsv"
-        ]
+        ext.containers = params.python3
+        executor = 'local'
+        cache = 'false' // Resume does not work on module, if enabled a warning is thrown
+        errorStrategy = "terminate"
+        publishDir = [
+            mode: params.publish_dir_mode,
+            path: "${params.outdir}/SummaryReport",
+            pattern: "final_report.tsv"
+        ]
}

withName: SHIGATYPER {
@@ -87,6 +88,7 @@ process {
//}

withName: BIN_KRAKEN2 {
ext.containers = params.python3
maxForks = 20;
publishDir = [
mode: params.publish_dir_mode,
@@ -105,20 +107,23 @@
}

withName: CHECK_ONT {
ext.containers = params.python3
publishDir = [
enabled: false
]

}

withName: PARSE_MASH {
ext.containers = params.python3
errorStrategy = { task.exitStatus == 255 || task.exitStatus == 1 ? 'ignore' : 'finish'}
publishDir = [
enabled: false
]
}

withName: PARSE_KRAKEN {
ext.containers = params.python3
errorStrategy = { task.exitStatus == 255 || task.exitStatus == 1 ? 'ignore' : 'finish'}
publishDir = [
enabled: false
@@ -127,6 +132,7 @@

withName: READ_SCAN {
errorStrategy = "terminate"
ext.containers = params.python3
publishDir = [
[
mode: params.publish_dir_mode,
@@ -391,6 +397,7 @@ process {

withName: PARSE_KAT {
// scratch = false
ext.containers = params.python3
executor = 'local'
errorStrategy = "terminate"

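A note on the `ext.containers` lines added throughout this file: they pair with the `container` directive this commit adds to each Python-based module. The config hands the process a params entry holding both image URIs, and the module picks one based on the active engine. Condensed from the lines in this diff:

```groovy
// conf/modules.config: attach the image pair to the process
withName: PARSE_KRAKEN {
    ext.containers = params.python3
}

// modules/local/parse_kraken.nf: select the image for the active engine
container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.containers.get('singularity') : task.ext.containers.get('docker')}"
```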
8 changes: 8 additions & 0 deletions docs/usage/configuration.md
@@ -245,6 +245,14 @@ Some processes only utilize bash scripting, normally Nextflow will utilize syste
- singularity: coreutils singularity container
- docker: coreutils docker container


### Python
Some scripts require Python, and to save anyone from needing a local Python install we just put the requirement into a container for you. Also, as all the scripts within mikrokondo use only the standard library, you can swap these containers to use **pypy3** and get a massive performance boost from the scripts (a sketch of the swap follows the list below)!

- python3
- singularity: Python3 singularity container
- docker: Python3 docker container
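
A minimal sketch of that swap via a custom config passed with `-c`. The image references below are illustrative only and have not been tested with the pipeline:

```groovy
// custom.config -- apply with: nextflow run ... -c custom.config
params {
    python3 {
        // illustrative pypy3 images; any stdlib-only pypy3 image should do
        singularity = "docker://pypy:3.10-slim"
        docker      = "pypy:3.10-slim"
    }
}
```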

### KAT
KAT was previously used to estimate genome size; however, at the time of writing KAT appears to be updated only infrequently, and newer versions had issues running or sometimes gave incorrect output due to failures in peak recognition, so KAT has been removed from the pipeline. Its code still remains, but it **will be removed in the future**.

1 change: 1 addition & 0 deletions modules/local/bin_kraken2.nf
@@ -8,6 +8,7 @@ process BIN_KRAKEN2{
tag "$meta.id"
label "process_low"
cache 'deep' // ! Deep caching is required to not bungle up the later metadata updates on resumes
container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.containers.get('singularity') : task.ext.containers.get('docker')}"

input:
tuple val(meta), path(contigs), path(kraken_report), path(kraken_output)
2 changes: 2 additions & 0 deletions modules/local/check_ont.nf
@@ -3,10 +3,12 @@
process CHECK_ONT{
tag "$meta.id"
label "process_single"
container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.containers.get('singularity') : task.ext.containers.get('docker')}"


// TODO add to publish dir
// TODO perhaps reads should just be dedupped by header...
// TODO Awk would be faster...

input:
tuple val(meta), path(reads)
1 change: 1 addition & 0 deletions modules/local/parse_kraken.nf
@@ -3,6 +3,7 @@
process PARSE_KRAKEN {
tag "$meta.id"
label "process_low"
container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.containers.get('singularity') : task.ext.containers.get('docker')}"

input:
tuple val(meta), path(kraken_report)
2 changes: 1 addition & 1 deletion modules/local/parse_mash.nf
@@ -6,7 +6,7 @@
process PARSE_MASH{
tag "$meta.id"
label "process_low"

container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.containers.get('singularity') : task.ext.containers.get('docker')}"

input:
tuple val(meta), path(mash_screen)
1 change: 1 addition & 0 deletions modules/local/read_summary.nf
@@ -5,6 +5,7 @@
process READ_SCAN{
label 'process_medium'
tag "${meta.id}"
container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.containers.get('singularity') : task.ext.containers.get('docker')}"

input:
tuple val(meta), path(reads), path(l_reads)
2 changes: 1 addition & 1 deletion modules/local/report_to_tsv.nf
@@ -4,7 +4,7 @@

process REPORT_TO_TSV{
tag "Report to TSV"

container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.containers.get('singularity') : task.ext.containers.get('docker')}"

input:
path summary_report
8 changes: 7 additions & 1 deletion nextflow.config
@@ -99,7 +99,7 @@ params {


// If a param in camel case is present nextflow automatically creates a kebab case parameter as well
-schema_ignore_params = 'pointfinder_db_tag,staramr,mobsuite_recon,skip_staramr,genomes,flye_read_type,shigeifinder,lissero,sistr,ectyper,bandage,bakta,unicycler,medaka,pilon_iterative,pilon,racon,samtools,minimap2,r_contaminants,mash,kraken,checkm,quast_filter,quast,fastqc,spades,flye,chopper,fastp,seqtk,kat,coreutils,opt_platforms,QCReportFields,QCReport-fields,QCReport,kraken_bin,shigatyper,spatyper,kleborate,subtyping_report,kraken_species,top_hit_species,mash_meta,mlst,raw_reads,abricate_params,target_depth'
+schema_ignore_params = 'python3,pointfinder_db_tag,staramr,mobsuite_recon,skip_staramr,genomes,flye_read_type,shigeifinder,lissero,sistr,ectyper,bandage,bakta,unicycler,medaka,pilon_iterative,pilon,racon,samtools,minimap2,r_contaminants,mash,kraken,checkm,quast_filter,quast,fastqc,spades,flye,chopper,fastp,seqtk,kat,coreutils,opt_platforms,QCReportFields,QCReport-fields,QCReport,kraken_bin,shigatyper,spatyper,kleborate,subtyping_report,kraken_species,top_hit_species,mash_meta,mlst,raw_reads,abricate_params,target_depth'

stage_in_mode = 'symlink'

@@ -141,6 +141,12 @@ params {
docker = 'quay.io/biocontainers/coreutils:8.31--h14c3975_0'
}

// Python container, may switch to pypy3
python3 {
singularity = "https://depot.galaxyproject.org/singularity/python%3A3.10.4"
docker = "docker://python/python:3.10.4-alpine3.16"
}

//KAT options
kat {
// using a different container as the Kat version kept in other containers segfaulted
10 changes: 10 additions & 0 deletions nextflow_schema.json
@@ -2587,6 +2587,16 @@
"type": "string",
"default": "PointfinderDB",
"hidden": true
},
"python3.singularity": {
"type": "string",
"default": "https://depot.galaxyproject.org/singularity/python%3A3.10.4",
"hidden": true
},
"python3.docker": {
"type": "string",
"default": "docker://python/python:3.10.4-alpine3.16",
"hidden": true
}
}
}
2 changes: 2 additions & 0 deletions subworkflows/local/determine_species.nf
@@ -29,9 +29,11 @@ workflow DETERMINE_SPECIES {
meta, report -> tuple(meta, params.kraken, report)
})

parsed = PARSE_KRAKEN(KRAKEN.out.report)
reports = reports.mix(parsed.kraken_top.map{
meta, report -> tuple(meta, params.top_hit_species, report)
})

top_hit = parsed.kraken_top
versions = versions.mix(parsed.versions)
versions = versions.mix(KRAKEN.out.versions)
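The lines added here route the parsed Kraken top hit into the shared `reports` channel, tagged with the params block it belongs to. A toy, self-contained sketch of the `mix`/`map` pattern (sample values invented):

```groovy
// toy.nf -- run with: nextflow run toy.nf
workflow {
    // the running reports channel: (meta, tool_params, report_file)
    reports = Channel.of(['sample1', 'mash', 'mash_screen.tsv'])

    // re-tag each (meta, report) pair, then merge it into reports
    kraken_top = Channel.of(['sample1', 'kraken_top.tsv'])
    reports = reports.mix(
        kraken_top.map { meta, report -> tuple(meta, 'top_hit_species', report) }
    )
    reports.view() // emits both tagged tuples on one merged channel
}
```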
