diff --git a/.wordlist.txt b/.wordlist.txt index ca3a4b14..f1e3227e 100644 --- a/.wordlist.txt +++ b/.wordlist.txt @@ -169,3 +169,9 @@ dependentRequired errorMessage Samplesheet TSeemann's +RASUSA +downsampling +Christy +Marinier +Petkau + diff --git a/CHANGELOG.md b/CHANGELOG.md index 13c5e445..23fc77ca 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,10 +5,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## Unreleased +### `Changed` + +- Added RASUSA for down sampling of Nanopore or PacBio data. [PR 125](https://github.com/phac-nml/mikrokondo/pull/125) + ### `Updated` - Documentation and workflow diagram has been updated. [PR 123](https://github.com/phac-nml/mikrokondo/pull/123) +- Documentation and README have been updated. [PR 126](https://github.com/phac-nml/mikrokondo/pull/126) + ## [0.4.2] - 2024-09-25 ### `Fixed` diff --git a/README.md b/README.md index cc9948ed..b9d4620d 100644 --- a/README.md +++ b/README.md @@ -53,7 +53,7 @@ This workflow will detect what pathogen(s) is present and apply the applicable m This software (currently unpublished) can be cited as: -- Wells, M. "mikrokondo" Github +- Matthew Wells, James Robertson, Aaron Petkau, Christy-Lynn Peterson, Eric Marinier. "mikrokondo" GitHub An extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file. 
@@ -103,33 +103,24 @@ The above downloadable resources must be updated in the following places in your ``` // Bakta db path, note the quotation marks -bakta { - db = "/PATH/TO/BAKTA/DB" -} +bakta_db = "/PATH/TO/BAKTA/DB" // Decontamination minimap2 index, note the quotation marks -r_contaminants { - mega_mm2_idx = "/PATH/TO/DECONTAMINATION/INDEX" -} +dehosting_idx = "/PATH/TO/DECONTAMINATION/INDEX" // kraken db path, not the quotation marks -kraken { - db = "/PATH/TO/KRAKEN/DATABASE/" -} +kraken2_db = "/PATH/TO/KRAKEN/DATABASE/" // GTDB Mash sketch, note the quotation marks -mash { - mash_sketch = "/PATH/TO/MASH/SKETCH/" -} +mash_sketch = "/PATH/TO/MASH/SKETCH/" // STARAMR database path, note the quotation marks // Passing in a StarAMR database is optional if one is not specified the database in the container will be used. You can just leave the db option as null if you do not wish to pass one -staramr { - db = "/PATH/TO/STARMAR/DB" -} - +staramr_db = "/PATH/TO/STARAMR/DB" ``` +The above parameters can also be passed to the pipeline as command-line arguments if they are not set in the `nextflow.config` file. + # Getting Started ## Usage diff --git a/conf/modules.config b/conf/modules.config index beedf128..8daefc95 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -332,6 +332,20 @@ process { ] } + withName: RASUSA { + ext.args = "" + ext.parameters = params.rasusa + publishDir = [ + [ + path: { [ "${task.read_downsampled_directory_name}", "Rasusa" ].join(File.separator) }, + mode: params.publish_dir_mode, + pattern: "*${params.rasusa.reads_ext}", + saveAs: { filename -> + filename.equals('versions.yml') ? 
null : reformat_output(filename, "reads", "rasusa.sample", meta) } + ] + ] + } + withName: SEQTK_SIZE { ext.args = "" diff --git a/docs/subworkflows/clean_reads.md b/docs/subworkflows/clean_reads.md index 08b1c3fe..70217e6f 100644 --- a/docs/subworkflows/clean_reads.md +++ b/docs/subworkflows/clean_reads.md @@ -1,49 +1,49 @@ -# Read Quality Control - -## subworkflows/local/clean_reads - -## Steps -1. **Reads are decontaminated** using [minimap2](https://github.com/lh3/minimap2), against a 'sequencing off-target' index. This index contains: - - Reads associated with Humans (de-hosting) - - Known sequencing controls (phiX) - -2. **Read quality filtering and trimming** is performed using [FastP](https://github.com/OpenGene/fastp) - - Currently no adapters are specified within FastP when it is run and auto-detection is used. - - FastP parameters can be altered within the [nextflow.config](https://github.com/phac-nml/mikrokondo/blob/main/nextflow.config) file. - - Long read data is also run through FastP for gathering of summary data, however long read (un-paired reads) trimming is not performed and only summary metrics are generated. [Chopper](https://github.com/wdecoster/chopper) is currently integrated in MikroKondo but it has been removed from this workflow due to a lack of interest in quality trimming of long read data. It may be reintroduced in the future upon request. - -3. **Genome size estimation** is performed using [Mash](https://github.com/marbl/Mash) Sketch of reads and estimated genome size is output. - -4. **Read downsampling** (OPTIONAL) an estimated depth threshold can be specified to down sample large read sets. This step can be used to improve genome assembly quality, and is something that can be found in other assembly pipelines such as [Shovill](https://github.com/tseemann/shovill). To disable down sampling add `--skip_depth_sampling true` to your command line. 
- - Depth is estimated by using the estimated genome size output from [Mash](https://github.com/marbl/Mash) - - Total basepairs are taken from [FastP](https://github.com/OpenGene/fastp) - - Read downsampling is then performed using [Seqtk](https://github.com/lh3/seqtk) - -5. **Metagenomic assesment** using a custom [Mash](https://github.com/marbl/Mash) 'sketch' file generated from the Genome Taxonomy Database [GTDB](https://gtdb.ecogenomic.org/) and the mash_screen module, this step assesses how many bacterial genera are present in a sample (e.g. a contaminated or metagenomic sample may have more than one genus of bacteria present) with greater than 90% identity (according to Mash). When more than 1 taxa are present, the metagenomic tag is set, turning on metagenomic assembly in later steps. Additionally Kraken2 will be run on metagenomic assemblies and contigs will be binned at a defined taxonomic level (default level: genus). - -6. **Nanopore ID screening** duplicate Nanopore read ID's have been known to cause issues in the pipeline downstream. In order to bypass this issue, an option can be toggled where a script will read in Nanopore reads and append a unique ID to the header, this process can be slow so default setting is to skip, `--skip_ont_header_cleaning true`. - -## Input -- Next generation sequencing reads: - + Short read - Illumina - + Long read: - * Nanopore - * Pacbio -- User submitted sample sheet - - -## Outputs -- Reads - - FinalReads - - SAMPLE - - Processing - - Dehosting - - Trimmed - - FastP - - Seqtk - - MashSketches - - Quality - - RawReadQuality - - Trimmed - - FastP - - MashScreen +# Read Quality Control + +## subworkflows/local/clean_reads + +## Steps +1. **Reads are decontaminated** using [minimap2](https://github.com/lh3/minimap2), against a 'sequencing off-target' index. This index contains: + - Reads associated with Humans (de-hosting) + - Known sequencing controls (phiX) + +2. 
**Read quality filtering and trimming** is performed using [FastP](https://github.com/OpenGene/fastp) + - Currently no adapters are specified within FastP when it is run and auto-detection is used. + - FastP parameters can be altered within the [nextflow.config](https://github.com/phac-nml/mikrokondo/blob/main/nextflow.config) file. + - Long read data is also run through FastP for gathering of summary data, however long read (un-paired reads) trimming is not performed and only summary metrics are generated. [Chopper](https://github.com/wdecoster/chopper) is currently integrated in MikroKondo but it has been removed from this workflow due to a lack of interest in quality trimming of long read data. It may be reintroduced in the future upon request. + +3. **Genome size estimation** is performed using [Mash](https://github.com/marbl/Mash) Sketch of reads and estimated genome size is output. + +4. **Read down sampling** (OPTIONAL) an estimated depth threshold can be specified to down sample large read sets. This step can be used to improve genome assembly quality, and is something that can be found in other assembly pipelines such as [Shovill](https://github.com/tseemann/shovill). To disable down sampling add `--skip_depth_sampling true` to your command line. + - Depth is estimated by using the estimated genome size output from [Mash](https://github.com/marbl/Mash) + - Total base pairs are taken from [FastP](https://github.com/OpenGene/fastp) + - Read down sampling is then performed using [Seqtk](https://github.com/lh3/seqtk) (Illumina) or [Rasusa](https://github.com/mbhall88/rasusa) (Nanopore or Pacbio). + +5. **Metagenomic assessment** using a custom [Mash](https://github.com/marbl/Mash) 'sketch' file generated from the Genome Taxonomy Database [GTDB](https://gtdb.ecogenomic.org/) and the mash_screen module, this step assesses how many bacterial genera are present in a sample (e.g. 
a contaminated or metagenomic sample may have more than one genus of bacteria present) with greater than 90% identity (according to Mash). When more than 1 taxa are present, the metagenomic tag is set, turning on metagenomic assembly in later steps. Additionally Kraken2 will be run on metagenomic assemblies and contigs will be binned at a defined taxonomic level (default level: genus). + +6. **Nanopore ID screening** duplicate Nanopore read ID's have been known to cause issues in the pipeline downstream. In order to bypass this issue, an option can be toggled where a script will read in Nanopore reads and append a unique ID to the header, this process can be slow so default setting is to skip, `--skip_ont_header_cleaning true`. + +## Input +- Next generation sequencing reads: + + Short read - Illumina + + Long read: + * Nanopore + * Pacbio +- User submitted sample sheet + + +## Outputs +- Reads + - FinalReads + - SAMPLE + - Processing + - Dehosting + - Trimmed + - FastP + - Seqtk + - MashSketches + - Quality + - RawReadQuality + - Trimmed + - FastP + - MashScreen diff --git a/docs/usage/installation.md b/docs/usage/installation.md index 9a1b81a9..6d67f32c 100644 --- a/docs/usage/installation.md +++ b/docs/usage/installation.md @@ -1,64 +1,56 @@ -# Installation - -## Dependencies -- Python (3.10>=) -- Nextflow (22.10.1>=) -- Container service (Docker, Singularity, Apptainer have been tested) -- The source code: `git clone https://github.com/phac-nml/mikrokondo.git` - -**Dependencies can be installed with Conda (e.g. Nextflow and Python)**. 
- -## To install mikrokondo -Once all dependencies are installed (see below for instructions), to download the pipeline run: - -`git clone https://github.com/phac-nml/mikrokondo.git` - -## Installing Nextflow -Nextflow is required to run mikrokondo (requires Linux), and instructions for its installation can be found at either: [Nextflow Home](https://www.nextflow.io/) or [Nextflow Documentation](https://www.nextflow.io/docs/latest/getstarted.html#installation) - -## Container Engine -Nextflow and Mikrokondo require the use of containers to run the pipeline, such as: Docker, Singularity (now apptainer), podman, gitpod, sifter and charliecloud. - -> **NOTE:** Singularity was adopted by the Linux Foundation and is now called Apptainer. Singularity still exists, however newer installs will likely use Apptainer. - -## Docker or Singularity? -Docker requires root privileges which can can make it a hassle to install on computing clusters, while there are work arounds, Apptainer/Singularity does not. Therefore, using Apptainer/Singularity is the recommended method for running the mikrokondo pipeline. - -### Issues -Containers are not perfect, below is a list of some issues you may face using containers in mikrokondo, fixes for each issue will be detailed here as they are identified. - -- **Exit code 137,** usually means the docker container used to much memory. - -## Resources to download -- [GTDB Mash Sketch](https://zenodo.org/record/8408361): required for speciation and determination when sample is metagenomic -- [Decontamination Index](https://zenodo.org/record/8408557): Required for decontamination of reads (this is a minimap2 index) -- [Kraken2 std database](https://benlangmead.github.io/aws-indexes/k2): Required for binning of metagenommic data and is an alternative to using Mash for speciation -- [Bakta database](https://zenodo.org/record/7669534): Running Bakta is optional and there is a light database option, however the full one is recommended. 
You will have to unzip and un-tar the database for usage. - -### Fields to update with resources -It is recommended to store the above resources within the `databases` folder in the mikrokondo folder, this allows for a simple update to the names of the database in `nextflow.config` rather than a need for a full path description. - -Below shows where to update database resources in the `params` section of the `nextflow.config` file: - -``` -// Bakta db path, note the quotation marks -bakta { - db = "/PATH/TO/BAKTA/DB" -} - -// Decontamination minimap2 index, note the quotation marks -r_contaminants { - mega_mm2_idx = "/PATH/TO/DECONTAMINATION/INDEX" -} - -// kraken db path, not the quotation marks -kraken { - db = "/PATH/TO/KRAKEN/DATABASE/" -} - -// GTDB Mash sketch, note the quotation marks -mash { - mash_sketch = "/PATH/TO/MASH/SKETCH/" -} - -``` +# Installation + +## Dependencies +- Python (3.10>=) +- Nextflow (22.10.1>=) +- Container service (Docker, Singularity, Apptainer have been tested) +- The source code: `git clone https://github.com/phac-nml/mikrokondo.git` + +**Dependencies can be installed with Conda (e.g. Nextflow and Python)**. + +## To install mikrokondo +Once all dependencies are installed (see below for instructions), to download the pipeline run: + +`git clone https://github.com/phac-nml/mikrokondo.git` + +## Installing Nextflow +Nextflow is required to run mikrokondo (requires Linux), and instructions for its installation can be found at either: [Nextflow Home](https://www.nextflow.io/) or [Nextflow Documentation](https://www.nextflow.io/docs/latest/getstarted.html#installation) + +## Container Engine +Nextflow and Mikrokondo require the use of containers to run the pipeline, such as: Docker, Singularity (now apptainer), podman, gitpod, sifter and charliecloud. + +> **NOTE:** Singularity was adopted by the Linux Foundation and is now called Apptainer. Singularity still exists, however newer installs will likely use Apptainer. 
+ +## Docker or Singularity? +Docker requires root privileges, which can make it a hassle to install on computing clusters; while there are workarounds, Apptainer/Singularity does not. Therefore, using Apptainer/Singularity is the recommended method for running the mikrokondo pipeline. + +### Issues +Containers are not perfect, below is a list of some issues you may face using containers in mikrokondo, fixes for each issue will be detailed here as they are identified. + +- **Exit code 137,** usually means the docker container used too much memory. + +## Resources to download +- [GTDB Mash Sketch](https://zenodo.org/record/8408361): required for speciation and determination when sample is metagenomic +- [Decontamination Index](https://zenodo.org/record/8408557): Required for decontamination of reads (this is a minimap2 index) +- [Kraken2 std database](https://benlangmead.github.io/aws-indexes/k2): Required for binning of metagenomic data and is an alternative to using Mash for speciation +- [Bakta database](https://zenodo.org/record/7669534): Running Bakta is optional and there is a light database option, however the full one is recommended. You will have to unzip and un-tar the database for usage. + +### Fields to update with resources +It is recommended to store the above resources within the `databases` folder in the mikrokondo folder, this allows for a simple update to the names of the database in `nextflow.config` rather than a need for a full path description. 
+ +Below shows where to update database resources in the `params` section of the `nextflow.config` file: + +``` +// Bakta db path, note the quotation marks +bakta_db = "/PATH/TO/BAKTA/DB" + +// Decontamination minimap2 index, note the quotation marks +dehosting_idx = "/PATH/TO/DECONTAMINATION/INDEX" + +// kraken db path, note the quotation marks +kraken2_db = "/PATH/TO/KRAKEN/DATABASE/" + +// GTDB Mash sketch, note the quotation marks +mash_sketch = "/PATH/TO/MASH/SKETCH/" + +``` diff --git a/docs/usage/tool_params.md b/docs/usage/tool_params.md index 7bc39444..aa0d2c13 100644 --- a/docs/usage/tool_params.md +++ b/docs/usage/tool_params.md @@ -15,15 +15,15 @@ Screens contigs for antimicrobial and virulence genes. If you wish to use a diff - singularity: Abricate singularity container - docker: Abricate docker container - **args**: Can be a string of additional command line arguments to pass to abricate - - report_tag: determines the name of the Abricate output in the final summary file. **Do no touch this unless doing pipeline development.** - - header_p: This field tells the report module that the Abricate output contains headers. **Do no touch this unless doing pipeline development.** + - report_tag: determines the name of the Abricate output in the final summary file. **Do not change this unless doing pipeline development.** + - header_p: This field tells the report module that the Abricate output contains headers. **Do not change this unless doing pipeline development.** ### Raw Read Metrics A custom Python script that gathers quality metrics for each fastq file. - raw_reads - high_precision: When set to true, floating point precision of values output are accurate down to very small decimal places. Recommended to leave this setting as false (use the standard floats), it is much faster and having such precise decimal places is not needed for this module. - - report_tag: this field determines the name of the Raw Read Metric field in the final summary report. 
**Do no touch this unless doing pipeline development.** + - report_tag: this field determines the name of the Raw Read Metric field in the final summary report. **Do not change this unless doing pipeline development.** ### Coreutils In cases where a process uses bash scripting only, Nextflow by default will utilize system binaries when they are available and no container is specified. For reproducibility, we have chosen to use containers in such cases. When a better container is available, you can direct the pipeline to use it via below commands: @@ -47,12 +47,21 @@ Kat was previously used to estimate genome size, however at the time of writing Seqtk is used for both the sub-sampling of reads and conversion of fasta files to fastq files in mikrokondo. The usage of seqtk to convert a fasta to a fastq is needed in certain typing tools requiring reads as input (this was a design decision to keep the pipeline generalizable). - seqtk - - singularity: singularity container for seqtk - - docker: docker container for seqtk + - singularity: Singularity container for seqtk + - docker: Docker container for seqtk - seed: A seed value for sub-sampling - reads_ext: Extension of reads after sub-sampling, do not touch alter this unless doing pipeline development. - - assembly_fastq: Extension of the fastas after being converted to fastq files. Do no touch this unless doing pipeline development. - - report_tag: Name of seqtk data in the final summary report. Do no touch this unless doing pipeline development. + - assembly_fastq: Extension of the fastas after being converted to fastq files. Do not change this unless doing pipeline development. + - report_tag: Name of seqtk data in the final summary report. Do not change this unless doing pipeline development. + +### Rasusa +For long read data Rasusa is used for down sampling as it takes read length into consideration when down sampling. 
+ +- rasusa + - singularity: singularity container for rasusa + - docker: docker container for rasusa + - seed: A seed value for sub-sampling + - reads_ext: The extension of the generated fastq files. Do not change this unless doing pipeline development. ### FastP Fastp is fast and widely used program for gathering of read quality metrics, adapter trimming, read filtering and read trimming. FastP has extensive options for configuration which are detailed in their documentation, but sensible defaults have been set. **Adapter trimming in Fastp is performed using overlap analysis, however if you do not trust this you can specify the sequencing adapters used directly in the additional arguments for Fastp**. @@ -60,8 +69,8 @@ Fastp is fast and widely used program for gathering of read quality metrics, ada - fastp - singularity: singularity container for FastP - docker: docker container for FastP - - fastq_ext: extension of the output Fastp trimmed reads, do not touch this unless doing pipeline development. - - html_ext: Extension of the html report output by fastp, do no touch unless doing pipeline development. + - fastq_ext: extension of the output Fastp trimmed reads, Do not change this unless doing pipeline development. + - html_ext: Extension of the html report output by fastp, Do not touch unless doing pipeline development. - json_ext: Extension of json report output by FastP do not touch unless doing pipeline development. - report_tag: Title of FastP data in the summary report. - **average_quality_e**: If a read/read-pair quality is less than this value it is discarded. Can be set from the command line with `--fp_average_quality`. 
diff --git a/docs/usage/usage.md b/docs/usage/usage.md index 0ea8fcc7..b2cf6c6f 100644 --- a/docs/usage/usage.md +++ b/docs/usage/usage.md @@ -109,7 +109,7 @@ Numerous steps within mikrokondo can be turned off without compromising the stab - `--skip_subtyping`: to turn off automatic triggering of subtyping in the pipeline (useful when target organism does not have a subtyping tool installed within mikrokondo). - `--skip_version_gathering`: prevents the collation of tool versions. This process generally takes a couple minutes (at worst) but can be useful when during recurrent runs of the pipeline (like when testing settings). - `--skip_report`: Prevents creation of final report summary report amalgamating outputs of all other files, this will also turnoff the creation of individual sub-reports. -- `--skip_metagenomic_detection`: Skips classification of sample as metagnomic and forces a sample to be analyzed as an isolate. +- `--skip_metagenomic_detection`: Skips classification of sample as metagenomic and forces a sample to be analyzed as an isolate. - `--skip_raw_read_metrics`: Prevents generation of raw read metrics, e.g. metrics generated about the reads before any trimming or filtering is performed. - `--skip_mlst`: Skip seven gene MLST. - `--skip_length_filtering_contigs`: Skip length filtering of contigs based on the `--qt_min_contig_length` parameter. @@ -128,7 +128,7 @@ Different databases/pre-computed files are required for usage within mikrokondo. Allele scheme selection parameters. - `--override_allele_scheme`: Provide the path to an allele scheme (currently only locidex is supported) that will be used for all samples provided. e.g. no automated allele database selection is performed, this scheme will be applied. -- `--lx_allele_database`: A path to a `manifest.json` file used by locidex for automated allele selection. This option cannot be used along side `--overrided_allele_scheme`. 
+- `--lx_allele_database`: A path to a `manifest.json` file used by locidex for automated allele selection. This option cannot be used along side `--override_allele_scheme`. >**Note:** The provide only a path to the `manifest.json` file as `some/directory` **NOT** `some/directory/manifest.json` @@ -187,7 +187,7 @@ Top level parameters for Locidex. The currently implemented allele caller, do no - `--lx_max_dna_len`: Global maximum query length of DNA strand. - `--lx_max_aa_len`: Global maximum query length of Amino Acid strand. - `--lx_min_dna_ident`: Global minimum DNA percent identity required for match. (float). -- `--lx_min_aa_ident`: Global minimum Amino Acid percent identiy required for match. (float). +- `--lx_min_aa_ident`: Global minimum Amino Acid percent identity required for match. (float). - `--lx_min_dna_match_cov`: Global minimum DNA percent hit coverage identity required for match (float). - `--lx_min_aa_match_cov`: Global minimum Amino Acid hit coverage identity required for match (float). - `--lx_max_target_seqs`: Maximum number of sequence hits per query. @@ -219,7 +219,7 @@ Different container services can be specified from the command line when running #### Slurm options -- `slurm_p true`: slurm execurtor will be used. +- `slurm_p true`: slurm executor will be used. - `slurm_profile STRING`: a string to allow the user to specify which slurm partition to use. ## Output diff --git a/modules/local/rasusa.nf b/modules/local/rasusa.nf new file mode 100644 index 00000000..97fd49af --- /dev/null +++ b/modules/local/rasusa.nf @@ -0,0 +1,26 @@ +/* + Downsample long reads with Rasusa +*/ + +process RASUSA { + tag "$meta.id" + label 'process_low' + container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? 
task.ext.parameters.get('singularity') : task.ext.parameters.get('docker')}" + + input: + tuple val(meta), path(reads), val(sample_fraction) + + output: + tuple val(meta), path("*${params.rasusa.reads_ext}"), val(sample_fraction), emit: sampled_reads + path "versions.yml", emit: versions + + script: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + rasusa reads -f ${sample_fraction} -s ${params.rasusa.seed} -O g -o ${prefix}${params.rasusa.reads_ext} ${reads.join(" ")} + cat <<-END_VERSIONS > versions.yml + "${task.process}": + rasusa: \$(rasusa --version 2>&1 | sed -e "s/rasusa //g") + END_VERSIONS + """ +} diff --git a/modules/local/remove_contaminants.nf b/modules/local/remove_contaminants.nf index 17dcf95d..6e11290e 100644 --- a/modules/local/remove_contaminants.nf +++ b/modules/local/remove_contaminants.nf @@ -57,7 +57,6 @@ process REMOVE_CONTAMINANTS { reads_out = "-1 ${meta.id}.R1.${params.r_contaminants.samtools_output_suffix}${params.r_contaminants.samtools_output_ext} -2 ${meta.id}.R2.${params.r_contaminants.samtools_output_suffix}${params.r_contaminants.samtools_output_ext} -s ${meta.id}${params.r_contaminants.samtools_singletons_ext}" } def zip_singletons = singled_ended ? "" : "gzip *${params.r_contaminants.samtools_singletons_ext}" - // TODO currently using a megaindex, but there may be a better way // -f4 in samtool view filters out unmapped reads // -N added to add /1 and /2 to reads with the same name diff --git a/modules/local/seqtk_sample.nf b/modules/local/seqtk_sample.nf index 61628b94..6d30633d 100644 --- a/modules/local/seqtk_sample.nf +++ b/modules/local/seqtk_sample.nf @@ -10,7 +10,7 @@ process SEQTK_SAMPLE{ tuple val(meta), path(reads), val(sample_fraction) output: - // TODO outputting sample fraction to match cardinality of non sampled read set, need to find a better solution... + // Outputting sample fraction to match cardinality of non sampled read set, should not be passed to the process in the future. 
tuple val(meta), path("*${params.seqtk.reads_ext}"), val(sample_fraction), emit: sampled_reads path "versions.yml", emit: versions diff --git a/nextflow.config b/nextflow.config index 7379c296..a60c9faf 100644 --- a/nextflow.config +++ b/nextflow.config @@ -197,6 +197,13 @@ params { docker = "biocontainers/python:3.12" } + rasusa { + docker = "biocontainers/rasusa:2.1.0--h715e4b3_0" + singularity = "https://depot.galaxyproject.org/singularity/rasusa:2.1.0--h715e4b3_0" + reads_ext = ".sampled.fastq.gz" + seed = 42 + } + seqtk { singularity = 'https://depot.galaxyproject.org/singularity/seqtk%3A1.4--he4a0461_1' docker = 'biocontainers/seqtk:1.4--he4a0461_1' diff --git a/subworkflows/local/clean_reads.nf b/subworkflows/local/clean_reads.nf index f8fc73d6..74f7cfcf 100644 --- a/subworkflows/local/clean_reads.nf +++ b/subworkflows/local/clean_reads.nf @@ -1,5 +1,5 @@ // Workflow for the cleaning up reads -// TODO Kat can take in all hybrid assembly files at once + include { FASTP_TRIM } from '../../modules/local/fastp_trim.nf' include { PARSE_FASTP } from '../../modules/local/parse_fastp.nf' include { CHOPPER_TRIM } from '../../modules/local/chopper_trim.nf' @@ -7,11 +7,10 @@ include { MASH_SCREEN } from '../../modules/local/mash_screen.nf' include { MASH_ESTIMATE } from '../../modules/local/mash_estimate.nf' include { REMOVE_CONTAMINANTS } from '../../modules/local/remove_contaminants.nf' include { PARSE_MASH } from '../../modules/local/parse_mash.nf' -include { KAT_HIST } from '../../modules/local/kat_hist.nf' -include { PARSE_KAT } from '../../modules/local/parse_kat.nf' include { CHECK_ONT } from '../../modules/local/check_ont.nf' include { FASTQC } from '../../modules/nf-core/fastqc/main.nf' include { SEQTK_SAMPLE } from '../../modules/local/seqtk_sample.nf' +include { RASUSA } from '../../modules/local/rasusa.nf' @@ -48,7 +47,7 @@ workflow QC_READS { // TODO add in nanoplot for nanopore data take: - reads // channel [[meta etc], [Read paths], opt: long reads] + 
reads // channel [[meta etc], [[Read paths], opt: long reads]] platform // platform opt main: @@ -56,17 +55,10 @@ workflow QC_READS { versions = Channel.empty() def platform_comp = platform.toString() - - // TODO add in code to check that there are always enough reads left over after decontamination - // TODO need to make sure that if one read is unmapped the other is not included as well deconned_reads = REMOVE_CONTAMINANTS(reads, params.r_contaminants.mega_mm2_idx ? file(params.r_contaminants.mega_mm2_idx) : error("--dehosting_idx ${params.dehosting_idx} is invalid"), Channel.value(platform_comp)) versions = versions.mix(REMOVE_CONTAMINANTS.out.versions) - ch_meta_cleaned_reads = FASTP_TRIM(deconned_reads.reads) // can use the json output of this to decide if chopper should be run - - - reports = reports.mix(ch_meta_cleaned_reads.fastp_json.map{ meta, json -> tuple(meta, params.fastp, json) }) @@ -81,7 +73,6 @@ workflow QC_READS { failed: true } - // This can be condensed to one line... reports = reports.mix(reads_passed.failed.map{ meta, count -> tuple(meta, params.filtered_reads, false) @@ -105,11 +96,9 @@ workflow QC_READS { hyb_lr = true } - // TODO move subsampling into a seperate workflow // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // Sampling depth estimation for each set of reads // It is requested that only sub-sampled reads go on for further analysis e.g. when calculating coverage only downsampled reads are used - // TODO determine if metagenomic samples should be down sampled read_sketch = MASH_ESTIMATE(filtered_samples, Channel.value(hyb_lr)) genome_sizes = read_sketch.gsize.map{ meta, gsize -> tuple(meta, get_size(gsize)) @@ -146,20 +135,39 @@ workflow QC_READS { log.info "Not down sampling ${it[0].id} as estimated sample depth is already below targeted depth of ${params.target_depth}." 
} + to_down_sample = reads_sample.sub_sample.branch { it -> + short_reads: !it[0].single_end + long_reads: true + } - down_sampled_reads = SEQTK_SAMPLE(reads_sample.sub_sample) - reports = reports.mix(down_sampled_reads.sampled_reads.map{ + // Short reads and hybrid reads sets get sampled with seqtk still. + //*~~~~~~~~~~~~~~~~~~~~~~~ + //* Seqtk is still being used for short reads and hybrid read sets until validation is finished. + //* as rasusa can then be validated for the rest of the workflow afterwards + //*~~~~~~~~~~~~~~~~~~~~~~~~ + down_sampled_reads_sr_hybr = SEQTK_SAMPLE(to_down_sample.short_reads) + reports = reports.mix(down_sampled_reads_sr_hybr.sampled_reads.map{ meta, reads, down_sampling -> tuple(meta, params.seqtk, down_sampling) }) + versions = versions.mix(down_sampled_reads_sr_hybr.versions) + + + // Long reads get downsampled with RASUSA + down_sampled_reads_lr = RASUSA(to_down_sample.long_reads) + reports = reports.mix(down_sampled_reads_lr.sampled_reads.map{ + meta, reads, down_sampling -> tuple(meta, params.rasusa, down_sampling) + }) + versions = versions.mix(down_sampled_reads_lr.versions) - reads_down_sampled_updated = down_sampled_reads.sampled_reads.map{ + // Mix downsampled reads back into same channel + down_sampled_reads = down_sampled_reads_sr_hybr.sampled_reads.mix(down_sampled_reads_lr.sampled_reads) + + reads_down_sampled_updated = down_sampled_reads.map{ meta, reads, sampling_factor -> meta.downsampled = true tuple(meta, reads, sampling_factor) } - versions = versions.mix(down_sampled_reads.versions) - ch_prepped_reads = reads_sample.other.mix(reads_down_sampled_updated).map{ meta, reads, sampling_factor -> tuple(meta, reads) } @@ -170,7 +178,6 @@ workflow QC_READS { } mash_screen_out = MASH_SCREEN(ch_prepped_reads, params.mash.mash_sketch ? 
file(params.mash.mash_sketch) : error("--mash_sketch ${params.mash_sketch} is invalid")) - versions = versions.mix(mash_screen_out.versions) // Determine if sample is metagenomic diff --git a/tests/modules/local/rasusa/rasusa.nf.test b/tests/modules/local/rasusa/rasusa.nf.test new file mode 100644 index 00000000..087caad1 --- /dev/null +++ b/tests/modules/local/rasusa/rasusa.nf.test @@ -0,0 +1,39 @@ +/* + Rasusa tests are using the tests provided by nf-core here: https://github.com/nf-core/modules/blob/master/modules/nf-core/rasusa/tests/main.nf.test +*/ + + +nextflow_process { + name "Test RASUSA" + script "modules/local/rasusa.nf" + process "RASUSA" + tag "rasusa" + + + test("Should run without failure") { + + when { + params { + outdir = "rasusa_test1" + modules_testdata_base_path = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/' + } + + process { + """ + input[0] = [ [id:'testfile', single_end:true], + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true)], + 0.5,] + """ + } + } + + then { + assertAll( + {assert process.success } + ) + } + + } + +} + diff --git a/tests/nextflow.config b/tests/nextflow.config index 1123a15c..1676c104 100644 --- a/tests/nextflow.config +++ b/tests/nextflow.config @@ -6,7 +6,8 @@ params.max_memory = "2.GB" params.max_cpus = 1 -params.fastp.illumina.args = "-Q" +params.fastp.args.illumina = "-Q" // Might need to remove this param was not being applied originally +params.fastp.args.single_end = "-Q" /* This is required for tests to run in WSL/Ubuntu using singularity Without this, ECTyper was not successfully completing. 
diff --git a/tests/subworkflows/local/clean_reads/clean_reads.nf.test b/tests/subworkflows/local/clean_reads/clean_reads.nf.test new file mode 100644 index 00000000..c8e6f921 --- /dev/null +++ b/tests/subworkflows/local/clean_reads/clean_reads.nf.test @@ -0,0 +1,256 @@ +nextflow_workflow { + name "Test workflow QC_READS" + script "subworkflows/local/clean_reads.nf" + workflow "QC_READS" + tag "subworkflow" + tag "clean_reads" + + test("Test clean reads run without failure illumina") { + tag "clean_illumina" + + when { + workflow { + """ + input[0] = Channel.of( + [ + [id: "SAMPlE1", + hybrid: false, + sample: "SAMPLE1", + assembly: false, + downsampled: false, + single_end: false, + merge: false], + [ + file("$baseDir/tests/data/reads/campy-staph1.fq.gz"), + file("$baseDir/tests/data/reads/campy-staph2.fq.gz") + ] + ]) + input[1] = "illumina" + """ + } + + params { + outdir = "results" + min_reads = 1 + mash_sketch = "https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/databases/campy-staph-ecoli.msh" + mh_min_kmer = 1 + + dehosting_idx = "https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/databases/campy.mmi" + + kraken2_db = "$baseDir/tests/data/kraken2/test" + + + max_memory = "2.GB" + max_cpus = 1 + } + + } + + then { + + assert workflow.success + assert path("${launchDir}/results/Reads/FinalReads/SAMPLE1/SAMPlE1.final.R1.trimmed.reads.fastq.gz").exists() + assert path("${launchDir}/results/Reads/FinalReads/SAMPLE1/SAMPlE1.final.R2.trimmed.reads.fastq.gz").exists() + assert path("${launchDir}/results/Reads/Processing/Dehosting/SAMPlE1.deconned.R1.deconned.reads.fastq.gz").exists() + assert path("${launchDir}/results/Reads/Processing/Dehosting/SAMPlE1.deconned.R2.deconned.reads.fastq.gz").exists() + assert path("${launchDir}/results/Reads/Processing/Dehosting/Trimmed/FastP/SAMPlE1.fastp.R1.trimmed.reads.fastq.gz").exists() + assert path("${launchDir}/results/Reads/Processing/Dehosting/Trimmed/FastP/SAMPlE1.fastp.R2.trimmed.reads.fastq.gz").exists() 
+ assert path("${launchDir}/results/Reads/Processing/Dehosting/Trimmed/MashSketches/SAMPlE1.mash.estimate.msh").exists() + assert path("${launchDir}/results/Reads/Quality/Trimmed/FastP/SAMPlE1.fastp.summary.html").exists() + assert path("${launchDir}/results/Reads/Quality/Trimmed/FastP/SAMPlE1.fastp.summary.json").exists() + assert path("${launchDir}/results/Reads/Quality/Trimmed/MashScreen/SAMPlE1.mash.screen.reads.screen.screen").exists() + assert path("${launchDir}/results/Reads/Processing/Dehosting/SAMPlE1.deconned.R1.deconned.reads.fastq.gz").linesGzip.size() == 496 + assert path("${launchDir}/results/Reads/Processing/Dehosting/SAMPlE1.deconned.R2.deconned.reads.fastq.gz").linesGzip.size() == 496 + assert path("${launchDir}/results/Reads/Processing/Dehosting/Trimmed/FastP/SAMPlE1.fastp.R1.trimmed.reads.fastq.gz").linesGzip.size() == 496 + assert path("${launchDir}/results/Reads/Processing/Dehosting/Trimmed/FastP/SAMPlE1.fastp.R2.trimmed.reads.fastq.gz").linesGzip.size() == 496 + snapshot(workflow.out).match() + + } + + } + + + test("Test clean reads run without failure fake nanopore") { + tag "fake_ont" + + when { + workflow { + """ + input[0] = Channel.of( + [ + [id: "SAMPlE1", + hybrid: false, + assembly: false, + sample: "SAMPLE1", + downsampled: false, + single_end: true, + merge: false], + [ + file("$baseDir/tests/data/reads/campy-staph1.fq.gz"), + ] + ]) + input[1] = "nanopore" + """ + } + + params { + outdir = "results" + min_reads = 1 + mash_sketch = "https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/databases/campy-staph-ecoli.msh" + mh_min_kmer = 1 + + dehosting_idx = "https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/databases/campy.mmi" + + kraken2_db = "$baseDir/tests/data/kraken2/test" + + + max_memory = "2.GB" + max_cpus = 1 + } + + } + + then { + assert workflow.success + assert path("${launchDir}/results/Reads/FinalReads/SAMPLE1/SAMPlE1.final.trimmed.reads.fastq.gz").exists() + assert 
path("${launchDir}/results/Reads/Processing/Dehosting/SAMPlE1.deconned.deconned.reads.fastq.gz").exists() + assert path("${launchDir}/results/Reads/Processing/Dehosting/Trimmed/FastP/SAMPlE1.fastp.trimmed.reads.fastq.gz").exists() + assert path("${launchDir}/results/Reads/Processing/Dehosting/Trimmed/MashSketches/SAMPlE1.mash.estimate.msh").exists() + assert path("${launchDir}/results/Reads/Quality/Trimmed/FastP/SAMPlE1.fastp.summary.html").exists() + assert path("${launchDir}/results/Reads/Quality/Trimmed/FastP/SAMPlE1.fastp.summary.json").exists() + assert path("${launchDir}/results/Reads/Quality/Trimmed/MashScreen/SAMPlE1.mash.screen.reads.screen.screen").exists() + assert path("${launchDir}/results/Reads/Processing/Dehosting/SAMPlE1.deconned.deconned.reads.fastq.gz").linesGzip.size() == 500 + assert path("${launchDir}/results/Reads/Processing/Dehosting/Trimmed/FastP/SAMPlE1.fastp.trimmed.reads.fastq.gz").linesGzip.size() == 500 + assert path("${launchDir}/results/Reads/FinalReads/SAMPLE1/SAMPlE1.final.trimmed.reads.fastq.gz").linesGzip.size() == 500 + snapshot(workflow.out).match() + + } + + } + + + test("Test clean reads run without failure fake nanopore (downsampling)") { + tag "fake_ont_downsample" + + when { + workflow { + """ + input[0] = Channel.of( + [ + [id: "SAMPlE1", + hybrid: false, + sample: "SAMPLE1", + assembly: false, + downsampled: false, + single_end: true, + merge: false], + [ + file("$baseDir/tests/data/reads/metagenomic_reads1.fq.gz"), + ], + ]) + input[1] = "nanopore" + """ + } + + params { + outdir = "results" + + mash_sketch = "https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/databases/campy-staph-ecoli.msh" + mh_min_kmer = 1 + + dehosting_idx = "https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/databases/campy.mmi" + + kraken2_db = "$baseDir/tests/data/kraken2/test" + target_depth = 1 + + max_memory = "2.GB" + max_cpus = 1 + } + + } + + then { + assert workflow.success + assert 
path("${launchDir}/results/Reads/FinalReads/SAMPLE1/SAMPlE1.final.sampled.reads.fastq.gz").exists() + assert path("${launchDir}/results/Reads/Processing/Dehosting/SAMPlE1.deconned.deconned.reads.fastq.gz").exists() + assert path("${launchDir}/results/Reads/Processing/Dehosting/Trimmed/DownSampled/Rasusa/SAMPlE1.rasusa.sample.sampled.reads.fastq.gz").exists() + assert path("${launchDir}/results/Reads/Processing/Dehosting/Trimmed/FastP/SAMPlE1.fastp.trimmed.reads.fastq.gz").exists() + assert path("${launchDir}/results/Reads/Quality/Trimmed/FastP/SAMPlE1.fastp.summary.html").exists() + assert path("${launchDir}/results/Reads/Quality/Trimmed/FastP/SAMPlE1.fastp.summary.json").exists() + assert path("${launchDir}/results/Reads/FinalReads/SAMPLE1/SAMPlE1.final.sampled.reads.fastq.gz").linesGzip.size() == 5656 + assert path("${launchDir}/results/Reads/Processing/Dehosting/SAMPlE1.deconned.deconned.reads.fastq.gz").linesGzip.size() == 16680 + assert path("${launchDir}/results/Reads/Processing/Dehosting/Trimmed/DownSampled/Rasusa/SAMPlE1.rasusa.sample.sampled.reads.fastq.gz").linesGzip.size() == 5656 + assert path("${launchDir}/results/Reads/Processing/Dehosting/Trimmed/FastP/SAMPlE1.fastp.trimmed.reads.fastq.gz").linesGzip.size() == 16680 + snapshot(workflow.out).match() + + } + + } + + test("Test clean reads run without failure illumina (downsampling)") { + tag "fake_ill_downsample" + + when { + workflow { + """ + input[0] = Channel.of( + [ + [id: "SAMPlE1", + hybrid: false, + sample: "SAMPLE1", + assembly: false, + downsampled: false, + single_end: false, + merge: false], + [ + file("$baseDir/tests/data/reads/metagenomic_reads1.fq.gz"), + file("$baseDir/tests/data/reads/metagenomic_reads2.fq.gz") + ], + ]) + input[1] = "illumina" + """ + } + + params { + outdir = "results" + + mash_sketch = "https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/databases/campy-staph-ecoli.msh" + mh_min_kmer = 1 + + dehosting_idx = 
"https://github.com/phac-nml/mikrokondo/raw/dev/tests/data/databases/campy.mmi" + + kraken2_db = "$baseDir/tests/data/kraken2/test" + target_depth = 1 + + max_memory = "2.GB" + max_cpus = 1 + } + + } + + then { + assert workflow.success + assert path("${launchDir}/results/Reads/Processing/Dehosting/Trimmed/FastP/SAMPlE1.fastp.R1.trimmed.reads.fastq.gz").exists() + assert path("${launchDir}/results/Reads/Processing/Dehosting/Trimmed/FastP/SAMPlE1.fastp.R2.trimmed.reads.fastq.gz").exists() + assert path("${launchDir}/results/Reads/Quality/Trimmed/FastP/SAMPlE1.fastp.summary.html").exists() + assert path("${launchDir}/results/Reads/Quality/Trimmed/FastP/SAMPlE1.fastp.summary.json").exists() + assert path("${launchDir}/results/Reads/FinalReads/SAMPLE1/SAMPlE1.SAMPlE1_R2.final.sampled.reads.fastq.gz").exists() + assert path("${launchDir}/results/Reads/Processing/Dehosting/SAMPlE1.deconned.R1.deconned.reads.fastq.gz").exists() + assert path("${launchDir}/results/Reads/Processing/Dehosting/SAMPlE1.deconned.R2.deconned.reads.fastq.gz").exists() + assert path("${launchDir}/results/Reads/Processing/Dehosting/Trimmed/DownSampled/SeqTK/SAMPlE1.SAMPlE1_R1.seqtk.sample.sampled.reads.fastq.gz").exists() + assert path("${launchDir}/results/Reads/Processing/Dehosting/Trimmed/DownSampled/SeqTK/SAMPlE1.SAMPlE1_R2.seqtk.sample.sampled.reads.fastq.gz").exists() + assert path("${launchDir}/results/Reads/Processing/Dehosting/Trimmed/FastP/SAMPlE1.fastp.R1.trimmed.reads.fastq.gz").exists() + assert path("${launchDir}/results/Reads/Processing/Dehosting/Trimmed/FastP/SAMPlE1.fastp.R2.trimmed.reads.fastq.gz").exists() + assert path("${launchDir}/results/Reads/FinalReads/SAMPLE1/SAMPlE1.SAMPlE1_R1.final.sampled.reads.fastq.gz").linesGzip.size() == 4860 + assert path("${launchDir}/results/Reads/FinalReads/SAMPLE1/SAMPlE1.SAMPlE1_R2.final.sampled.reads.fastq.gz").linesGzip.size() == 4860 + assert 
path("${launchDir}/results/Reads/Processing/Dehosting/SAMPlE1.deconned.R1.deconned.reads.fastq.gz").linesGzip.size() == 16680 + assert path("${launchDir}/results/Reads/Processing/Dehosting/SAMPlE1.deconned.R2.deconned.reads.fastq.gz").linesGzip.size() == 16680 + assert path("${launchDir}/results/Reads/Processing/Dehosting/Trimmed/DownSampled/SeqTK/SAMPlE1.SAMPlE1_R1.seqtk.sample.sampled.reads.fastq.gz").linesGzip.size() == 4860 + assert path("${launchDir}/results/Reads/Processing/Dehosting/Trimmed/DownSampled/SeqTK/SAMPlE1.SAMPlE1_R2.seqtk.sample.sampled.reads.fastq.gz").linesGzip.size() == 4860 + assert path("${launchDir}/results/Reads/Processing/Dehosting/Trimmed/FastP/SAMPlE1.fastp.R1.trimmed.reads.fastq.gz").linesGzip.size() == 16680 + assert path("${launchDir}/results/Reads/Processing/Dehosting/Trimmed/FastP/SAMPlE1.fastp.R2.trimmed.reads.fastq.gz").linesGzip.size() == 16680 + snapshot(workflow.out).match() + } + } + +} diff --git a/tests/subworkflows/local/clean_reads/clean_reads.nf.test.snap b/tests/subworkflows/local/clean_reads/clean_reads.nf.test.snap new file mode 100644 index 00000000..0cbdae04 --- /dev/null +++ b/tests/subworkflows/local/clean_reads/clean_reads.nf.test.snap @@ -0,0 +1,1042 @@ +{ + "Test clean reads run without failure illumina": { + "content": [ + { + "0": [ + [ + { + "id": "SAMPlE1", + "hybrid": false, + "sample": "SAMPLE1", + "assembly": false, + "downsampled": false, + "single_end": false, + "merge": false, + "metagenomic": false + }, + [ + "SAMPlE1.R1.trimmed.fastq.gz:md5,b556c9169c14812c9413a66441ef7a52", + "SAMPlE1.R2.trimmed.fastq.gz:md5,2ab187b817ff8afb8f83fdbc51a7e287" + ] + ] + ], + "1": [ + [ + { + "id": "SAMPlE1", + "hybrid": false, + "sample": "SAMPLE1", + "assembly": false, + "downsampled": false, + "single_end": false, + "merge": false + }, + 22797 + ] + ], + "2": [ + [ + { + "id": "SAMPlE1", + "hybrid": false, + "sample": "SAMPLE1", + "assembly": false, + "downsampled": false, + "single_end": false, + "merge": false, + 
"metagenomic": false + }, + { + "report_tag": "MashMeta" + }, + "false" + ], + [ + { + "id": "SAMPlE1", + "hybrid": false, + "sample": "SAMPLE1", + "assembly": false, + "downsampled": false, + "single_end": false, + "merge": false + }, + { + "fastq_ext": ".trimmed.fastq.gz", + "singularity": "https://depot.galaxyproject.org/singularity/fastp%3A0.23.2--hb7a2d85_2", + "docker": "biocontainers/fastp:0.23.2--hb7a2d85_2", + "html_ext": ".html", + "json_ext": ".json", + "report_tag": "FastP", + "average_quality_e": 25, + "cut_tail_mean_quality": 15, + "cut_tail_window_size": 4, + "complexity_threshold": 20, + "qualified_quality_phred": 15, + "unqualified_percent_limit": 40, + "polyg_min_len": 10, + "polyx_min_len": 10, + "illumina_length_min": 35, + "illumina_length_max": 400, + "single_end_length_min": 1000, + "dedup_reads": false, + "args": { + "illumina": "-Q", + "single_end": "-Q" + }, + "report_exclude_fields": [ + "content_curves", + "quality_curves", + "mean", + "kmer_count", + "histogram", + "overrepresented_sequences" + ] + }, + "SAMPlE1.json:md5,b1e96c37c41a9753485d40b1f0cf7033" + ], + [ + { + "id": "SAMPlE1", + "hybrid": false, + "sample": "SAMPLE1", + "assembly": false, + "downsampled": false, + "single_end": false, + "merge": false + }, + { + "threshold": 1, + "report_tag": "MeetsReadThreshold" + }, + true + ] + ], + "3": [ + "versions.yml:md5,24650bb863b10ea27510d94eafc0f6ff", + "versions.yml:md5,3879b6c1eb5ca6e84d9f6d67c3b7c97a", + "versions.yml:md5,662de06a73ecf3af0c3a670b6cbdb130", + "versions.yml:md5,9f9fdde5178fcb4d6a69e86c6f7d61b2" + ], + "genome_size": [ + [ + { + "id": "SAMPlE1", + "hybrid": false, + "sample": "SAMPLE1", + "assembly": false, + "downsampled": false, + "single_end": false, + "merge": false + }, + 22797 + ] + ], + "reports": [ + [ + { + "id": "SAMPlE1", + "hybrid": false, + "sample": "SAMPLE1", + "assembly": false, + "downsampled": false, + "single_end": false, + "merge": false, + "metagenomic": false + }, + { + "report_tag": 
"MashMeta" + }, + "false" + ], + [ + { + "id": "SAMPlE1", + "hybrid": false, + "sample": "SAMPLE1", + "assembly": false, + "downsampled": false, + "single_end": false, + "merge": false + }, + { + "fastq_ext": ".trimmed.fastq.gz", + "singularity": "https://depot.galaxyproject.org/singularity/fastp%3A0.23.2--hb7a2d85_2", + "docker": "biocontainers/fastp:0.23.2--hb7a2d85_2", + "html_ext": ".html", + "json_ext": ".json", + "report_tag": "FastP", + "average_quality_e": 25, + "cut_tail_mean_quality": 15, + "cut_tail_window_size": 4, + "complexity_threshold": 20, + "qualified_quality_phred": 15, + "unqualified_percent_limit": 40, + "polyg_min_len": 10, + "polyx_min_len": 10, + "illumina_length_min": 35, + "illumina_length_max": 400, + "single_end_length_min": 1000, + "dedup_reads": false, + "args": { + "illumina": "-Q", + "single_end": "-Q" + }, + "report_exclude_fields": [ + "content_curves", + "quality_curves", + "mean", + "kmer_count", + "histogram", + "overrepresented_sequences" + ] + }, + "SAMPlE1.json:md5,b1e96c37c41a9753485d40b1f0cf7033" + ], + [ + { + "id": "SAMPlE1", + "hybrid": false, + "sample": "SAMPLE1", + "assembly": false, + "downsampled": false, + "single_end": false, + "merge": false + }, + { + "threshold": 1, + "report_tag": "MeetsReadThreshold" + }, + true + ] + ], + "trimmed_reads": [ + [ + { + "id": "SAMPlE1", + "hybrid": false, + "sample": "SAMPLE1", + "assembly": false, + "downsampled": false, + "single_end": false, + "merge": false, + "metagenomic": false + }, + [ + "SAMPlE1.R1.trimmed.fastq.gz:md5,b556c9169c14812c9413a66441ef7a52", + "SAMPlE1.R2.trimmed.fastq.gz:md5,2ab187b817ff8afb8f83fdbc51a7e287" + ] + ] + ], + "versions": [ + "versions.yml:md5,24650bb863b10ea27510d94eafc0f6ff", + "versions.yml:md5,3879b6c1eb5ca6e84d9f6d67c3b7c97a", + "versions.yml:md5,662de06a73ecf3af0c3a670b6cbdb130", + "versions.yml:md5,9f9fdde5178fcb4d6a69e86c6f7d61b2" + ] + } + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.4" + }, + "timestamp": 
"2024-10-02T15:22:59.669978749" + }, + "Test clean reads run without failure fake nanopore (downsampling)": { + "content": [ + { + "0": [ + [ + { + "id": "SAMPlE1", + "hybrid": false, + "sample": "SAMPLE1", + "assembly": false, + "downsampled": true, + "single_end": true, + "merge": false, + "metagenomic": false + }, + "SAMPlE1.sampled.fastq.gz:md5,83fc03cad8f5d56ab450a23f071a2752" + ] + ], + "1": [ + [ + { + "id": "SAMPlE1", + "hybrid": false, + "sample": "SAMPLE1", + "assembly": false, + "downsampled": false, + "single_end": true, + "merge": false + }, + 353337 + ] + ], + "2": [ + [ + { + "id": "SAMPlE1", + "hybrid": false, + "sample": "SAMPLE1", + "assembly": false, + "downsampled": true, + "single_end": true, + "merge": false, + "metagenomic": false + }, + { + "report_tag": "MashMeta" + }, + "false" + ], + [ + { + "id": "SAMPlE1", + "hybrid": false, + "sample": "SAMPLE1", + "assembly": false, + "downsampled": true, + "single_end": true, + "merge": false + }, + { + "docker": "biocontainers/rasusa:2.1.0--h715e4b3_0", + "singularity": "https://depot.galaxyproject.org/singularity/rasusa:2.1.0--h715e4b3_0", + "reads_ext": ".sampled.fastq.gz", + "seed": 42 + }, + 0.339 + ], + [ + { + "id": "SAMPlE1", + "hybrid": false, + "sample": "SAMPLE1", + "assembly": false, + "downsampled": true, + "single_end": true, + "merge": false + }, + { + "fastq_ext": ".trimmed.fastq.gz", + "singularity": "https://depot.galaxyproject.org/singularity/fastp%3A0.23.2--hb7a2d85_2", + "docker": "biocontainers/fastp:0.23.2--hb7a2d85_2", + "html_ext": ".html", + "json_ext": ".json", + "report_tag": "FastP", + "average_quality_e": 25, + "cut_tail_mean_quality": 15, + "cut_tail_window_size": 4, + "complexity_threshold": 20, + "qualified_quality_phred": 15, + "unqualified_percent_limit": 40, + "polyg_min_len": 10, + "polyx_min_len": 10, + "illumina_length_min": 35, + "illumina_length_max": 400, + "single_end_length_min": 1000, + "dedup_reads": false, + "args": { + "illumina": "-Q", + "single_end": 
"-Q" + }, + "report_exclude_fields": [ + "content_curves", + "quality_curves", + "mean", + "kmer_count", + "histogram", + "overrepresented_sequences" + ] + }, + "SAMPlE1.json:md5,120cf53778f43424edb66c671b3c62f7" + ], + [ + { + "id": "SAMPlE1", + "hybrid": false, + "sample": "SAMPLE1", + "assembly": false, + "downsampled": true, + "single_end": true, + "merge": false + }, + { + "threshold": 1000, + "report_tag": "MeetsReadThreshold" + }, + true + ] + ], + "3": [ + "versions.yml:md5,24650bb863b10ea27510d94eafc0f6ff", + "versions.yml:md5,3879b6c1eb5ca6e84d9f6d67c3b7c97a", + "versions.yml:md5,662de06a73ecf3af0c3a670b6cbdb130", + "versions.yml:md5,9f9fdde5178fcb4d6a69e86c6f7d61b2", + "versions.yml:md5,db19eb2aca274569d2c27aefdd0b3b73" + ], + "genome_size": [ + [ + { + "id": "SAMPlE1", + "hybrid": false, + "sample": "SAMPLE1", + "assembly": false, + "downsampled": false, + "single_end": true, + "merge": false + }, + 353337 + ] + ], + "reports": [ + [ + { + "id": "SAMPlE1", + "hybrid": false, + "sample": "SAMPLE1", + "assembly": false, + "downsampled": true, + "single_end": true, + "merge": false, + "metagenomic": false + }, + { + "report_tag": "MashMeta" + }, + "false" + ], + [ + { + "id": "SAMPlE1", + "hybrid": false, + "sample": "SAMPLE1", + "assembly": false, + "downsampled": true, + "single_end": true, + "merge": false + }, + { + "docker": "biocontainers/rasusa:2.1.0--h715e4b3_0", + "singularity": "https://depot.galaxyproject.org/singularity/rasusa:2.1.0--h715e4b3_0", + "reads_ext": ".sampled.fastq.gz", + "seed": 42 + }, + 0.339 + ], + [ + { + "id": "SAMPlE1", + "hybrid": false, + "sample": "SAMPLE1", + "assembly": false, + "downsampled": true, + "single_end": true, + "merge": false + }, + { + "fastq_ext": ".trimmed.fastq.gz", + "singularity": "https://depot.galaxyproject.org/singularity/fastp%3A0.23.2--hb7a2d85_2", + "docker": "biocontainers/fastp:0.23.2--hb7a2d85_2", + "html_ext": ".html", + "json_ext": ".json", + "report_tag": "FastP", + "average_quality_e": 25, 
+ "cut_tail_mean_quality": 15, + "cut_tail_window_size": 4, + "complexity_threshold": 20, + "qualified_quality_phred": 15, + "unqualified_percent_limit": 40, + "polyg_min_len": 10, + "polyx_min_len": 10, + "illumina_length_min": 35, + "illumina_length_max": 400, + "single_end_length_min": 1000, + "dedup_reads": false, + "args": { + "illumina": "-Q", + "single_end": "-Q" + }, + "report_exclude_fields": [ + "content_curves", + "quality_curves", + "mean", + "kmer_count", + "histogram", + "overrepresented_sequences" + ] + }, + "SAMPlE1.json:md5,120cf53778f43424edb66c671b3c62f7" + ], + [ + { + "id": "SAMPlE1", + "hybrid": false, + "sample": "SAMPLE1", + "assembly": false, + "downsampled": true, + "single_end": true, + "merge": false + }, + { + "threshold": 1000, + "report_tag": "MeetsReadThreshold" + }, + true + ] + ], + "trimmed_reads": [ + [ + { + "id": "SAMPlE1", + "hybrid": false, + "sample": "SAMPLE1", + "assembly": false, + "downsampled": true, + "single_end": true, + "merge": false, + "metagenomic": false + }, + "SAMPlE1.sampled.fastq.gz:md5,83fc03cad8f5d56ab450a23f071a2752" + ] + ], + "versions": [ + "versions.yml:md5,24650bb863b10ea27510d94eafc0f6ff", + "versions.yml:md5,3879b6c1eb5ca6e84d9f6d67c3b7c97a", + "versions.yml:md5,662de06a73ecf3af0c3a670b6cbdb130", + "versions.yml:md5,9f9fdde5178fcb4d6a69e86c6f7d61b2", + "versions.yml:md5,db19eb2aca274569d2c27aefdd0b3b73" + ] + } + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.4" + }, + "timestamp": "2024-10-02T15:23:48.645310821" + }, + "Test clean reads run without failure illumina (downsampling)": { + "content": [ + { + "0": [ + [ + { + "id": "SAMPlE1", + "hybrid": false, + "sample": "SAMPLE1", + "assembly": false, + "downsampled": true, + "single_end": false, + "merge": false, + "metagenomic": false + }, + [ + "SAMPlE1_R1.sampled.fastq.gz:md5,e6cd4df6e3e943f3a0b4c96c851a0bd1", + "SAMPlE1_R2.sampled.fastq.gz:md5,c58d06431927a8b165a0d4471f260536" + ] + ] + ], + "1": [ + [ + { + "id": "SAMPlE1", + 
"hybrid": false, + "sample": "SAMPLE1", + "assembly": false, + "downsampled": false, + "single_end": false, + "merge": false + }, + 624151 + ] + ], + "2": [ + [ + { + "id": "SAMPlE1", + "hybrid": false, + "sample": "SAMPLE1", + "assembly": false, + "downsampled": true, + "single_end": false, + "merge": false, + "metagenomic": false + }, + { + "report_tag": "MashMeta" + }, + "false" + ], + [ + { + "id": "SAMPlE1", + "hybrid": false, + "sample": "SAMPLE1", + "assembly": false, + "downsampled": true, + "single_end": false, + "merge": false + }, + { + "fastq_ext": ".trimmed.fastq.gz", + "singularity": "https://depot.galaxyproject.org/singularity/fastp%3A0.23.2--hb7a2d85_2", + "docker": "biocontainers/fastp:0.23.2--hb7a2d85_2", + "html_ext": ".html", + "json_ext": ".json", + "report_tag": "FastP", + "average_quality_e": 25, + "cut_tail_mean_quality": 15, + "cut_tail_window_size": 4, + "complexity_threshold": 20, + "qualified_quality_phred": 15, + "unqualified_percent_limit": 40, + "polyg_min_len": 10, + "polyx_min_len": 10, + "illumina_length_min": 35, + "illumina_length_max": 400, + "single_end_length_min": 1000, + "dedup_reads": false, + "args": { + "illumina": "-Q", + "single_end": "-Q" + }, + "report_exclude_fields": [ + "content_curves", + "quality_curves", + "mean", + "kmer_count", + "histogram", + "overrepresented_sequences" + ] + }, + "SAMPlE1.json:md5,7da5f0f3505f8caa250064d497b3b1ab" + ], + [ + { + "id": "SAMPlE1", + "hybrid": false, + "sample": "SAMPLE1", + "assembly": false, + "downsampled": true, + "single_end": false, + "merge": false + }, + { + "singularity": "https://depot.galaxyproject.org/singularity/seqtk%3A1.4--he4a0461_1", + "docker": "biocontainers/seqtk:1.4--he4a0461_1", + "seed": 42, + "reads_ext": ".sampled.fastq.gz", + "assembly_fastq": ".fastq.gz", + "report_tag": "Seqtk" + }, + 0.299 + ], + [ + { + "id": "SAMPlE1", + "hybrid": false, + "sample": "SAMPLE1", + "assembly": false, + "downsampled": true, + "single_end": false, + "merge": false + 
}, + { + "threshold": 1000, + "report_tag": "MeetsReadThreshold" + }, + true + ] + ], + "3": [ + "versions.yml:md5,24650bb863b10ea27510d94eafc0f6ff", + "versions.yml:md5,3879b6c1eb5ca6e84d9f6d67c3b7c97a", + "versions.yml:md5,662de06a73ecf3af0c3a670b6cbdb130", + "versions.yml:md5,9f9fdde5178fcb4d6a69e86c6f7d61b2", + "versions.yml:md5,e3e3100fb1f616478648071cd0e2a63e" + ], + "genome_size": [ + [ + { + "id": "SAMPlE1", + "hybrid": false, + "sample": "SAMPLE1", + "assembly": false, + "downsampled": false, + "single_end": false, + "merge": false + }, + 624151 + ] + ], + "reports": [ + [ + { + "id": "SAMPlE1", + "hybrid": false, + "sample": "SAMPLE1", + "assembly": false, + "downsampled": true, + "single_end": false, + "merge": false, + "metagenomic": false + }, + { + "report_tag": "MashMeta" + }, + "false" + ], + [ + { + "id": "SAMPlE1", + "hybrid": false, + "sample": "SAMPLE1", + "assembly": false, + "downsampled": true, + "single_end": false, + "merge": false + }, + { + "fastq_ext": ".trimmed.fastq.gz", + "singularity": "https://depot.galaxyproject.org/singularity/fastp%3A0.23.2--hb7a2d85_2", + "docker": "biocontainers/fastp:0.23.2--hb7a2d85_2", + "html_ext": ".html", + "json_ext": ".json", + "report_tag": "FastP", + "average_quality_e": 25, + "cut_tail_mean_quality": 15, + "cut_tail_window_size": 4, + "complexity_threshold": 20, + "qualified_quality_phred": 15, + "unqualified_percent_limit": 40, + "polyg_min_len": 10, + "polyx_min_len": 10, + "illumina_length_min": 35, + "illumina_length_max": 400, + "single_end_length_min": 1000, + "dedup_reads": false, + "args": { + "illumina": "-Q", + "single_end": "-Q" + }, + "report_exclude_fields": [ + "content_curves", + "quality_curves", + "mean", + "kmer_count", + "histogram", + "overrepresented_sequences" + ] + }, + "SAMPlE1.json:md5,7da5f0f3505f8caa250064d497b3b1ab" + ], + [ + { + "id": "SAMPlE1", + "hybrid": false, + "sample": "SAMPLE1", + "assembly": false, + "downsampled": true, + "single_end": false, + "merge": false + 
}, + { + "singularity": "https://depot.galaxyproject.org/singularity/seqtk%3A1.4--he4a0461_1", + "docker": "biocontainers/seqtk:1.4--he4a0461_1", + "seed": 42, + "reads_ext": ".sampled.fastq.gz", + "assembly_fastq": ".fastq.gz", + "report_tag": "Seqtk" + }, + 0.299 + ], + [ + { + "id": "SAMPlE1", + "hybrid": false, + "sample": "SAMPLE1", + "assembly": false, + "downsampled": true, + "single_end": false, + "merge": false + }, + { + "threshold": 1000, + "report_tag": "MeetsReadThreshold" + }, + true + ] + ], + "trimmed_reads": [ + [ + { + "id": "SAMPlE1", + "hybrid": false, + "sample": "SAMPLE1", + "assembly": false, + "downsampled": true, + "single_end": false, + "merge": false, + "metagenomic": false + }, + [ + "SAMPlE1_R1.sampled.fastq.gz:md5,e6cd4df6e3e943f3a0b4c96c851a0bd1", + "SAMPlE1_R2.sampled.fastq.gz:md5,c58d06431927a8b165a0d4471f260536" + ] + ] + ], + "versions": [ + "versions.yml:md5,24650bb863b10ea27510d94eafc0f6ff", + "versions.yml:md5,3879b6c1eb5ca6e84d9f6d67c3b7c97a", + "versions.yml:md5,662de06a73ecf3af0c3a670b6cbdb130", + "versions.yml:md5,9f9fdde5178fcb4d6a69e86c6f7d61b2", + "versions.yml:md5,e3e3100fb1f616478648071cd0e2a63e" + ] + } + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.4" + }, + "timestamp": "2024-10-02T15:24:12.552717116" + }, + "Test clean reads run without failure fake nanopore": { + "content": [ + { + "0": [ + [ + { + "id": "SAMPlE1", + "hybrid": false, + "assembly": false, + "sample": "SAMPLE1", + "downsampled": false, + "single_end": true, + "merge": false, + "metagenomic": false + }, + "SAMPlE1.trimmed.fastq.gz:md5,0c0a1017fbf2821190d207bac34d8fbb" + ] + ], + "1": [ + [ + { + "id": "SAMPlE1", + "hybrid": false, + "assembly": false, + "sample": "SAMPLE1", + "downsampled": false, + "single_end": true, + "merge": false + }, + 16980 + ] + ], + "2": [ + [ + { + "id": "SAMPlE1", + "hybrid": false, + "assembly": false, + "sample": "SAMPLE1", + "downsampled": false, + "single_end": true, + "merge": false, + "metagenomic": 
false + }, + { + "report_tag": "MashMeta" + }, + "false" + ], + [ + { + "id": "SAMPlE1", + "hybrid": false, + "assembly": false, + "sample": "SAMPLE1", + "downsampled": false, + "single_end": true, + "merge": false + }, + { + "fastq_ext": ".trimmed.fastq.gz", + "singularity": "https://depot.galaxyproject.org/singularity/fastp%3A0.23.2--hb7a2d85_2", + "docker": "biocontainers/fastp:0.23.2--hb7a2d85_2", + "html_ext": ".html", + "json_ext": ".json", + "report_tag": "FastP", + "average_quality_e": 25, + "cut_tail_mean_quality": 15, + "cut_tail_window_size": 4, + "complexity_threshold": 20, + "qualified_quality_phred": 15, + "unqualified_percent_limit": 40, + "polyg_min_len": 10, + "polyx_min_len": 10, + "illumina_length_min": 35, + "illumina_length_max": 400, + "single_end_length_min": 1000, + "dedup_reads": false, + "args": { + "illumina": "-Q", + "single_end": "-Q" + }, + "report_exclude_fields": [ + "content_curves", + "quality_curves", + "mean", + "kmer_count", + "histogram", + "overrepresented_sequences" + ] + }, + "SAMPlE1.json:md5,ef21c948b21354b81116611b9e4740a2" + ], + [ + { + "id": "SAMPlE1", + "hybrid": false, + "assembly": false, + "sample": "SAMPLE1", + "downsampled": false, + "single_end": true, + "merge": false + }, + { + "threshold": 1, + "report_tag": "MeetsReadThreshold" + }, + true + ] + ], + "3": [ + "versions.yml:md5,24650bb863b10ea27510d94eafc0f6ff", + "versions.yml:md5,3879b6c1eb5ca6e84d9f6d67c3b7c97a", + "versions.yml:md5,662de06a73ecf3af0c3a670b6cbdb130", + "versions.yml:md5,9f9fdde5178fcb4d6a69e86c6f7d61b2" + ], + "genome_size": [ + [ + { + "id": "SAMPlE1", + "hybrid": false, + "assembly": false, + "sample": "SAMPLE1", + "downsampled": false, + "single_end": true, + "merge": false + }, + 16980 + ] + ], + "reports": [ + [ + { + "id": "SAMPlE1", + "hybrid": false, + "assembly": false, + "sample": "SAMPLE1", + "downsampled": false, + "single_end": true, + "merge": false, + "metagenomic": false + }, + { + "report_tag": "MashMeta" + }, + "false" + 
], + [ + { + "id": "SAMPlE1", + "hybrid": false, + "assembly": false, + "sample": "SAMPLE1", + "downsampled": false, + "single_end": true, + "merge": false + }, + { + "fastq_ext": ".trimmed.fastq.gz", + "singularity": "https://depot.galaxyproject.org/singularity/fastp%3A0.23.2--hb7a2d85_2", + "docker": "biocontainers/fastp:0.23.2--hb7a2d85_2", + "html_ext": ".html", + "json_ext": ".json", + "report_tag": "FastP", + "average_quality_e": 25, + "cut_tail_mean_quality": 15, + "cut_tail_window_size": 4, + "complexity_threshold": 20, + "qualified_quality_phred": 15, + "unqualified_percent_limit": 40, + "polyg_min_len": 10, + "polyx_min_len": 10, + "illumina_length_min": 35, + "illumina_length_max": 400, + "single_end_length_min": 1000, + "dedup_reads": false, + "args": { + "illumina": "-Q", + "single_end": "-Q" + }, + "report_exclude_fields": [ + "content_curves", + "quality_curves", + "mean", + "kmer_count", + "histogram", + "overrepresented_sequences" + ] + }, + "SAMPlE1.json:md5,ef21c948b21354b81116611b9e4740a2" + ], + [ + { + "id": "SAMPlE1", + "hybrid": false, + "assembly": false, + "sample": "SAMPLE1", + "downsampled": false, + "single_end": true, + "merge": false + }, + { + "threshold": 1, + "report_tag": "MeetsReadThreshold" + }, + true + ] + ], + "trimmed_reads": [ + [ + { + "id": "SAMPlE1", + "hybrid": false, + "assembly": false, + "sample": "SAMPLE1", + "downsampled": false, + "single_end": true, + "merge": false, + "metagenomic": false + }, + "SAMPlE1.trimmed.fastq.gz:md5,0c0a1017fbf2821190d207bac34d8fbb" + ] + ], + "versions": [ + "versions.yml:md5,24650bb863b10ea27510d94eafc0f6ff", + "versions.yml:md5,3879b6c1eb5ca6e84d9f6d67c3b7c97a", + "versions.yml:md5,662de06a73ecf3af0c3a670b6cbdb130", + "versions.yml:md5,9f9fdde5178fcb4d6a69e86c6f7d61b2" + ] + } + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.4" + }, + "timestamp": "2024-10-02T15:23:24.663457495" + } +} \ No newline at end of file