diff --git a/assets/schema_input.json b/assets/schema_input.json index 4a7bd527..ea6d99e5 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -53,6 +53,24 @@ "description": "Run markdup in UMI-aware mode. This applies to Samtools only and requires the UMI to be in the read name.", "default": false }, + "fgumi_aware": { + "meta": ["fgumi_aware"], + "type": "boolean", + "description": "Enable UMI-aware consensus processing through the fgumi branch.", + "default": false + }, + "fgumi_read_structures": { + "meta": ["fgumi_read_structures"], + "type": "string", + "description": "Read structures passed to fgumi extract for this sample.", + "default": null + }, + "fgumi_extract_umis_from_read_names": { + "meta": ["fgumi_extract_umis_from_read_names"], + "type": "boolean", + "description": "Override fgumi extraction from read names for this sample.", + "default": null + }, "skip_trimming": { "meta": ["skip_trimming"], "type": "boolean", diff --git a/assets/schema_sampleinfo.json b/assets/schema_sampleinfo.json index dee4c572..d19bb1c4 100644 --- a/assets/schema_sampleinfo.json +++ b/assets/schema_sampleinfo.json @@ -93,6 +93,24 @@ "description": "Run markdup in UMI-aware mode. 
This applies to Samtools only and requires the UMI to be in the read name.", "default": false }, + "fgumi_aware": { + "meta": ["fgumi_aware"], + "type": "boolean", + "description": "Enable UMI-aware consensus processing through the fgumi branch.", + "default": false + }, + "fgumi_read_structures": { + "meta": ["fgumi_read_structures"], + "type": "string", + "description": "Read structures passed to fgumi extract for this sample.", + "default": null + }, + "fgumi_extract_umis_from_read_names": { + "meta": ["fgumi_extract_umis_from_read_names"], + "type": "boolean", + "description": "Override fgumi extraction from read names for this sample.", + "default": null + }, "skip_trimming": { "meta": ["skip_trimming"], "type": "boolean", diff --git a/conf/modules.config b/conf/modules.config index becf78e8..257e42b9 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -231,6 +231,127 @@ process { } } + //// FGUMI extract (step 1) + withName: '.*FGUMI_EXTRACT' { + cpus = 4 + memory = 16.GB + ext.prefix = { "${meta.id}.fgumi.unmapped" } + ext.args = { + [ + "--sample \"${meta.id}\"", + "--library \"${meta.library ?: meta.id}\"", + "--read-structures ${meta.fgumi_read_structures ?: '+T +T'}", + ((meta.fgumi_extract_umis_from_read_names != null ? meta.fgumi_extract_umis_from_read_names : true) ? "--extract-umis-from-read-names" : ""), + "--compression-level ${params.fgumi_compression_level}", + ].join(" ").trim() + } + } + + //// FGUMI fastq | SNAP | zipper | template sort (step 3) + withName: '.*FGUMI_SNAP_ZIPPER_SORT' { + cpus = 16 + memory = 64.GB + ext.prefix = { "${meta.id}.fgumi" } + ext.args = { + [ + "-b-", + "-sm 20", + params.fgumi_snap_ignore_mismatched_pairs ? "-I" : "", + "-hc-", + "-S id", + "-sa", + "-xf 2", + meta.readgroup ? "-R \"@RG\\t" + meta.readgroup.findResults { rg -> rg.value?.trim() ? 
"${rg.key}:${rg.value}" : null }.join("\\t") + "\"" : "", + ].join(" ").trim() + } + ext.args2 = { + [ + "--threads ${task.cpus}", + ].join(" ").trim() + } + ext.args3 = { + [ + "--threads ${task.cpus}", + "--max-memory ${params.fgumi_sort_max_memory}", + "--memory-per-thread=${params.fgumi_sort_memory_per_thread ? 'true' : 'false'}", + ].join(" ").trim() + } + ext.args4 = { + [ + "--threads ${task.cpus}", + "--max-memory ${params.fgumi_sort_max_memory}", + "--memory-per-thread=${params.fgumi_sort_memory_per_thread ? 'true' : 'false'}", + ].join(" ").trim() + } + ext.args5 = { + [ + "-@ ${task.cpus}", + ].join(" ").trim() + } + } + + //// FGUMI group (step 4): per-sample meta values override the pipeline-level fgumi_* params + withName: '.*FGUMI_GROUP' { + cpus = 8 + memory = 32.GB + ext.prefix = { "${meta.id}.fgumi.group" } + ext.args = { + [ + "--strategy ${meta.fgumi_group_strategy ?: params.fgumi_group_strategy}", + "--edits ${meta.fgumi_group_edits != null ? meta.fgumi_group_edits : params.fgumi_group_edits}", + "--compression-level ${meta.fgumi_compression_level != null ? meta.fgumi_compression_level : params.fgumi_compression_level}", + "--grouping-metrics ${meta.id}.fgumi.group.grouping_metrics.txt", + "--family-size-histogram ${meta.id}.fgumi.group.family_size_histogram.txt", + ].join(" ").trim() + } + } + + //// FGUMI simplex (step 5) + withName: '.*FGUMI_SIMPLEX' { + cpus = 8 + memory = 32.GB + ext.prefix = { "${meta.id}.fgumi.simplex" } + ext.args = { + [ + "--min-reads ${params.fgumi_simplex_min_reads}", + "--threads ${task.cpus}", + "--queue-memory ${params.fgumi_queue_memory}", + params.fgumi_queue_memory_per_thread ? 
"--queue-memory-per-thread" : "", + "--compression-level ${params.fgumi_compression_level}", + "--stats ${meta.id}.fgumi.simplex.consensus_metrics.txt", + ].join(" ").trim() + } + } + + //// FGUMI filter (step 7) + withName: '.*FGUMI_FILTER' { + cpus = 4 + memory = 16.GB + ext.prefix = { "${meta.id}.fgumi.filter" } + ext.args = { + [ + "--min-reads 1,1,1", + "--stats ${meta.id}.fgumi.filter.filtering_metrics.txt", + ].join(" ").trim() + } + } + + //// FGUMI coordinate sort/index after filter (step 7) + withName: '.*FGUMI_SORT' { + cpus = 8 + memory = 32.GB + ext.prefix = { "${meta.id}.fgumi.filter" } + ext.args = { + [ + "--order coordinate", + "--write-index", + "--threads ${task.cpus}", + "--max-memory ${params.fgumi_sort_max_memory}", + "--memory-per-thread=${params.fgumi_sort_memory_per_thread ? 'true' : 'false'}", + ].join(" ").trim() + } + } + // coverage //// Mosdepth withName: '.*COVERAGE:MOSDEPTH' { diff --git a/docs/usage.md b/docs/usage.md index f59c809a..7f3055af 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -26,6 +26,7 @@ A `fastq` samplesheet file consisting of paired-end data may look something like aligner: bwamem markdup: bamsormadup umi_aware: false + fgumi_aware: false skip_trimming: false trim_front: 0 trim_tail: 0 @@ -67,6 +68,10 @@ Following table shows the fields that are used by the `fastq` samplesheet: An [example samplesheet](../tests/inputs/test.yml) has been provided with the pipeline. +> [!NOTE] +> `umi_aware` and `fgumi_aware` are independent options. +> Use `umi_aware` for samtools markdup UMI mode, and `fgumi_aware` to run the fgumi consensus branch. + ### Flowcell samplesheet A `flowcell` samplesheet file consisting of one sequencing run may look something like the one below. 
@@ -102,6 +107,7 @@ A `flowcell` sample info JSON/YML file consisting for one sequencing run may loo aligner: bwamem markdup: bamsormadup umi_aware: false + fgumi_aware: false skip_trimming: false trim_front: 0 trim_tail: 0 diff --git a/main.nf b/main.nf index 64e8d3db..c866d9e4 100644 --- a/main.nf +++ b/main.nf @@ -79,6 +79,9 @@ workflow { rna_junctions = PREPROCESSING.out.rna_junctions align_reports = PREPROCESSING.out.align_reports sormadup_metrics = PREPROCESSING.out.sormadup_metrics + // Additional UMI consensus outputs. + family_size_histogram = PREPROCESSING.out.family_size_histogram + umi_filtered_consensus_bam = PREPROCESSING.out.umi_filtered_consensus_bam mosdepth_global = PREPROCESSING.out.mosdepth_global mosdepth_summary = PREPROCESSING.out.mosdepth_summary mosdepth_regions = PREPROCESSING.out.mosdepth_regions @@ -176,6 +179,17 @@ output { metrics >> (meta.library ? "${meta.library}/${meta.samplename}/${meta.samplename}.duplicate_metrics.txt" : "${meta.samplename}/${meta.samplename}.duplicate_metrics.txt") } } + // UMI consensus artefacts are published per sample next to CRAM outputs. + family_size_histogram { + path { meta, histogram -> + histogram >> (meta.library ? "${meta.library}/${meta.samplename}/${histogram.name}" : "${meta.samplename}/${histogram.name}") + } + } + umi_filtered_consensus_bam { + path { meta, bam -> + bam >> (meta.library ? "${meta.library}/${meta.samplename}/${bam.name}" : "${meta.samplename}/${bam.name}") + } + } mosdepth_global { path { meta, _file -> return (meta.library ? "${meta.library}/${meta.samplename}/" : "${meta.samplename}/") diff --git a/modules/local/fgumi/extract/main.nf b/modules/local/fgumi/extract/main.nf new file mode 100644 index 00000000..89002f13 --- /dev/null +++ b/modules/local/fgumi/extract/main.nf @@ -0,0 +1,40 @@ +process FGUMI_EXTRACT { + tag "$meta.id" + label 'process_medium' + + container "${workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container + ? 
'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/95/954170443a820787c9e02ef2135ebb8ec29c6b03633b0d61b5fafa98c59a1cce/data' + : 'community.wave.seqera.io/library/fgumi_r-base_r-ggplot2_r-scales:09c99070b82c1c28'}" + + input: + tuple val(meta), path(reads) + + output: + tuple val(meta), path("${prefix}.bam"), emit: bam + tuple val("${task.process}"), val('fgumi'), eval("fgumi --version | sed 's/^fgumi //;q'"), topic: versions, emit: versions_fgumi + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + // Derive per-thread queue memory from requested process resources. + def queue_memory_mb = (task.memory.mega / task.cpus * 0.75).intValue() + prefix = task.ext.prefix ?: "${meta.id}.fgumi.unmapped" + + """ + fgumi extract \ + --inputs ${reads} \ + --output ${prefix}.bam \ + --threads ${task.cpus} \ + --queue-memory ${queue_memory_mb} \ + --queue-memory-per-thread \ + ${args} + """ + + stub: + prefix = task.ext.prefix ?: "${meta.id}.fgumi.unmapped" + """ + touch ${prefix}.bam + """ +} diff --git a/modules/local/fgumi/filter/main.nf b/modules/local/fgumi/filter/main.nf new file mode 100644 index 00000000..b63752da --- /dev/null +++ b/modules/local/fgumi/filter/main.nf @@ -0,0 +1,38 @@ +process FGUMI_FILTER { + tag "$meta.id" + label 'process_medium' + + container "${workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container + ? 
'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/95/954170443a820787c9e02ef2135ebb8ec29c6b03633b0d61b5fafa98c59a1cce/data' + : 'community.wave.seqera.io/library/fgumi_r-base_r-ggplot2_r-scales:09c99070b82c1c28'}" + + input: + tuple val(meta), path(bam), path(fasta) + + output: + tuple val(meta), path("${prefix}.filtered.bam"), emit: bam + tuple val(meta), path("${prefix}.filtering_metrics.txt"), optional: true, emit: filtering_metrics + tuple val("${task.process}"), val('fgumi'), eval("fgumi --version | sed 's/^fgumi //;q'"), topic: versions, emit: versions_fgumi + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}.fgumi.filter" + + """ + fgumi filter \ + --input ${bam} \ + --output ${prefix}.filtered.bam \ + --ref ${fasta} \ + ${args} + """ + + stub: + prefix = task.ext.prefix ?: "${meta.id}.fgumi.filter" + """ + touch ${prefix}.filtered.bam + touch ${prefix}.filtering_metrics.txt + """ +} diff --git a/modules/local/fgumi/group/main.nf b/modules/local/fgumi/group/main.nf new file mode 100644 index 00000000..d079db7f --- /dev/null +++ b/modules/local/fgumi/group/main.nf @@ -0,0 +1,44 @@ +process FGUMI_GROUP { + tag "$meta.id" + label 'process_medium' + + container "${workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container + ? 
'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/95/954170443a820787c9e02ef2135ebb8ec29c6b03633b0d61b5fafa98c59a1cce/data' + : 'community.wave.seqera.io/library/fgumi_r-base_r-ggplot2_r-scales:09c99070b82c1c28'}" + + input: + tuple val(meta), path(bam) + + output: + tuple val(meta), path("${prefix}.bam"), emit: bam + tuple val(meta), path("${prefix}.grouping_metrics.txt"), optional: true, emit: grouping_metrics + tuple val(meta), path("${prefix}.family_size_histogram.txt"), optional: true, emit: family_size_histogram + tuple val("${task.process}"), val('fgumi'), eval("fgumi --version | sed 's/^fgumi //;q'"), topic: versions, emit: versions_fgumi + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + // Derive per-thread queue memory from requested process resources. + def queue_memory_mb = (task.memory.mega / task.cpus * 0.75).intValue() + prefix = task.ext.prefix ?: "${meta.id}.fgumi.group" + + """ + fgumi group \ + --input ${bam} \ + --output ${prefix}.bam \ + --threads ${task.cpus} \ + --queue-memory ${queue_memory_mb} \ + --queue-memory-per-thread \ + ${args} + """ + + stub: + prefix = task.ext.prefix ?: "${meta.id}.fgumi.group" + """ + touch ${prefix}.bam + touch ${prefix}.grouping_metrics.txt + touch ${prefix}.family_size_histogram.txt + """ +} diff --git a/modules/local/fgumi/simplex/main.nf b/modules/local/fgumi/simplex/main.nf new file mode 100644 index 00000000..81dfc29a --- /dev/null +++ b/modules/local/fgumi/simplex/main.nf @@ -0,0 +1,37 @@ +process FGUMI_SIMPLEX { + tag "$meta.id" + label 'process_medium' + + container "${workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container + ? 
'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/95/954170443a820787c9e02ef2135ebb8ec29c6b03633b0d61b5fafa98c59a1cce/data' + : 'community.wave.seqera.io/library/fgumi_r-base_r-ggplot2_r-scales:09c99070b82c1c28'}" + + input: + tuple val(meta), path(bam) + + output: + tuple val(meta), path("${prefix}.bam"), emit: bam + tuple val(meta), path("${prefix}.consensus_metrics.txt"), optional: true, emit: consensus_metrics + tuple val("${task.process}"), val('fgumi'), eval("fgumi --version | sed 's/^fgumi //;q'"), topic: versions, emit: versions_fgumi + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}.fgumi.simplex" + + """ + fgumi simplex \ + --input ${bam} \ + --output ${prefix}.bam \ + ${args} + """ + + stub: + prefix = task.ext.prefix ?: "${meta.id}.fgumi.simplex" + """ + touch ${prefix}.bam + touch ${prefix}.consensus_metrics.txt + """ +} diff --git a/modules/local/fgumi/snapzippersort/main.nf b/modules/local/fgumi/snapzippersort/main.nf new file mode 100644 index 00000000..0c6ec63b --- /dev/null +++ b/modules/local/fgumi/snapzippersort/main.nf @@ -0,0 +1,66 @@ +process FGUMI_SNAP_ZIPPER_SORT { + tag "$meta.id" + label 'process_high' + + container "${workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container + ? 
'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/24/2466270633749543330f352e34588f142de4988585ce63e7f22ee5ed1ff57450/data' + : 'community.wave.seqera.io/library/fgumi_samtools_snap-aligner:c9ba911435350668'}" + + input: + tuple val(meta), path(unmapped_bam), path(index, stageAs: "index/*"), path(fasta), path(dict) + + output: + tuple val(meta), path("${prefix}.template.bam"), emit: bam + tuple val("${task.process}"), val('fgumi'), eval("fgumi --version | sed 's/^fgumi //;q'"), topic: versions, emit: versions_fgumi + + when: + task.ext.when == null || task.ext.when + + script: + def snap_args = task.ext.args ?: '' + def zipper_args = task.ext.args2 ?: '' + def sort_args = task.ext.args3 ?: '' + prefix = task.ext.prefix ?: "${meta.id}.fgumi" + + """ + INDEX=`dirname \$(find -L ./ -name "OverflowTable*" | head -n1)` + [ -z "\$INDEX" ] && echo "Snap index files not found" 1>&2 && exit 1 + + # Ensure zipper and fastq read exactly the same queryname-ordered unmapped stream. 
+ samtools sort \ + -n \ + -@ ${task.cpus} \ + -m 1G \ + -o ${prefix}.unmapped.queryname.bam \ + ${unmapped_bam} + + fgumi fastq --input ${prefix}.unmapped.queryname.bam \ + | snap-aligner paired \ + \$INDEX \ + -pairedInterleavedFastq - \ + -o -sam - \ + -t ${task.cpus} \ + ${snap_args} \ + | samtools sort \ + -n \ + -@ ${task.cpus} \ + -m 1G \ + -O SAM \ + - \ + | fgumi zipper \ + --unmapped ${prefix}.unmapped.queryname.bam \ + --reference ${fasta} \ + ${zipper_args} \ + | fgumi sort \ + --input /dev/stdin \ + --output ${prefix}.template.bam \ + --order template-coordinate \ + ${sort_args} + """ + + stub: + prefix = task.ext.prefix ?: "${meta.id}.fgumi" + """ + touch ${prefix}.template.bam + """ +} diff --git a/modules/local/fgumi/sort/main.nf b/modules/local/fgumi/sort/main.nf new file mode 100644 index 00000000..abfbf633 --- /dev/null +++ b/modules/local/fgumi/sort/main.nf @@ -0,0 +1,37 @@ +process FGUMI_SORT { + tag "$meta.id" + label 'process_medium' + + container "${workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container + ? 
'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/95/954170443a820787c9e02ef2135ebb8ec29c6b03633b0d61b5fafa98c59a1cce/data' + : 'community.wave.seqera.io/library/fgumi_r-base_r-ggplot2_r-scales:09c99070b82c1c28'}" + + input: + tuple val(meta), path(bam) + + output: + tuple val(meta), path("${prefix}.bam"), emit: bam + tuple val(meta), path("${prefix}.bam.bai"), emit: bai + tuple val("${task.process}"), val('fgumi'), eval("fgumi --version | sed 's/^fgumi //;q'"), topic: versions, emit: versions_fgumi + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}.fgumi.filter" + + """ + fgumi sort \ + --input ${bam} \ + --output ${prefix}.bam \ + ${args} + """ + + stub: + prefix = task.ext.prefix ?: "${meta.id}.fgumi.filter" + """ + touch ${prefix}.bam + touch ${prefix}.bam.bai + """ +} diff --git a/nextflow.config b/nextflow.config index 61685c2d..3d067c34 100644 --- a/nextflow.config +++ b/nextflow.config @@ -22,6 +22,17 @@ params { split_fastq = 100000000 genelists = null + // UMI consensus (fgumi) options + fgumi_group_strategy = 'adjacency' + fgumi_group_edits = 1 + fgumi_simplex_min_reads = 1 + fgumi_queue_memory = 768 + fgumi_queue_memory_per_thread= true + fgumi_compression_level = 1 + fgumi_sort_max_memory = '2G' + fgumi_sort_memory_per_thread = true + fgumi_snap_ignore_mismatched_pairs = true + // MultiQC options multiqc_config = null multiqc_title = null diff --git a/nextflow_schema.json b/nextflow_schema.json index e0d5f227..1bbeed1e 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -72,6 +72,58 @@ "exists": true, "format": "directory-path", "description": "Directory containing gene list bed files for granular coverage analysis" + }, + "fgumi_group_strategy": { + "type": "string", + "default": "adjacency", + "description": "UMI grouping strategy for fgumi group.", + "enum": ["identity", "edit", "adjacency", "paired"] + }, + "fgumi_group_edits": { + 
"type": "integer", + "default": 1, + "minimum": 0, + "description": "Maximum UMI edit distance used by fgumi group." + }, + "fgumi_simplex_min_reads": { + "type": "integer", + "default": 1, + "minimum": 1, + "description": "Minimum number of reads required per UMI family for fgumi simplex consensus generation." + }, + "fgumi_queue_memory": { + "type": "integer", + "default": 768, + "minimum": 64, + "description": "fgumi queue-memory budget in MB." + }, + "fgumi_queue_memory_per_thread": { + "type": "boolean", + "default": true, + "description": "Scale fgumi queue-memory by allocated thread count." + }, + "fgumi_compression_level": { + "type": "integer", + "default": 1, + "minimum": 0, + "maximum": 12, + "description": "Compression level for fgumi BAM outputs." + }, + "fgumi_sort_max_memory": { + "type": "string", + "default": "2G", + "pattern": "^\\d+(\\.\\d+)?[KMG]$", + "description": "Memory budget for fgumi sort, for example 2G or 768M." + }, + "fgumi_sort_memory_per_thread": { + "type": "boolean", + "default": true, + "description": "Scale fgumi sort memory by allocated thread count." + }, + "fgumi_snap_ignore_mismatched_pairs": { + "type": "boolean", + "default": true, + "description": "Pass -I to SNAP to ignore mismatched read IDs in paired-end input." 
} } }, diff --git a/subworkflows/local/fastq_to_aligned_cram/main.nf b/subworkflows/local/fastq_to_aligned_cram/main.nf index dfccf2d5..024256f8 100644 --- a/subworkflows/local/fastq_to_aligned_cram/main.nf +++ b/subworkflows/local/fastq_to_aligned_cram/main.nf @@ -13,6 +13,7 @@ include { SAMTOOLS_SORT } from "../../../modules/nf-core/samtools/sort/m // SUBWORKFLOWS include { FASTQ_ALIGN_DNA } from '../../nf-core/fastq_align_dna/main' include { FASTQ_ALIGN_RNA } from '../../local/fastq_align_rna/main' +include { UMI_CONSENSUS_FGUMI } from '../../local/umi_consensus/main.nf' // FUNCTIONS include { getGenomeAttribute } from '../../local/utils_nfcore_preprocessing_pipeline' @@ -42,12 +43,27 @@ workflow FASTQ_TO_CRAM { } .set { ch_meta_reads_aligner_index_fasta_datatype } - // align fastq files per sample - // ALIGNMENT([meta,fastq], index, sort) + ch_meta_reads_aligner_index_fasta_datatype.dna + .branch { meta, reads, aligner, index, fasta -> + // fgumi consensus is opt-in via fgumi_aware to avoid changing samtools umi_aware semantics. + umi: meta.fgumi_aware == true + return [meta, reads, aligner, index, fasta] + non_umi: true + return [meta, reads, aligner, index, fasta] + } + .set { ch_dna_to_align } + + // Align non-UMI DNA fastq files per sample FASTQ_ALIGN_DNA( - ch_meta_reads_aligner_index_fasta_datatype.dna, + ch_dna_to_align.non_umi, false, ) + + // UMI-aware fgumi branch (steps 1, 3, 4, 5, 6, 7 in fgumi Basic Workflow) + UMI_CONSENSUS_FGUMI( + ch_dna_to_align.umi + ) + FASTQ_ALIGN_RNA( ch_meta_reads_aligner_index_fasta_datatype.rna ) @@ -100,6 +116,17 @@ workflow FASTQ_TO_CRAM { ch_markdup_index = channel.empty() + // UMI branch outputs are mixed into the common markdup/metrics streams. NOTE(review): main.nf appears to publish sormadup_metrics under the fixed name <samplename>.duplicate_metrics.txt, so the four fgumi metrics files mixed in below may collide on one published path (family_size_histogram is also published separately) - confirm. 
+ ch_markdup_index = ch_markdup_index.mix( + UMI_CONSENSUS_FGUMI.out.bam_bai + ) + ch_sormadup_metrics = ch_sormadup_metrics.mix(UMI_CONSENSUS_FGUMI.out.grouping_metrics) + ch_sormadup_metrics = ch_sormadup_metrics.mix(UMI_CONSENSUS_FGUMI.out.family_size_histogram) + ch_sormadup_metrics = ch_sormadup_metrics.mix(UMI_CONSENSUS_FGUMI.out.consensus_metrics) + ch_sormadup_metrics = ch_sormadup_metrics.mix(UMI_CONSENSUS_FGUMI.out.filtering_metrics) + ch_family_size_histogram = UMI_CONSENSUS_FGUMI.out.family_size_histogram + ch_filtered_consensus_bam = UMI_CONSENSUS_FGUMI.out.filtered_consensus_bam + // BIOBAMBAM_BAMSORMADUP([meta, [bam, bam]], fasta, fai) BIOBAMBAM_BAMSORMADUP(ch_bam_fasta.bamsormadup) ch_markdup_index = ch_markdup_index.mix(BIOBAMBAM_BAMSORMADUP.out.bam.join(BIOBAMBAM_BAMSORMADUP.out.bam_index, failOnMismatch: true, failOnDuplicate: true)) @@ -149,8 +176,11 @@ workflow FASTQ_TO_CRAM { emit: cram_crai = ch_cram_crai + // UMI-specific output channels for downstream reporting and publishing. 
+ filtered_consensus_bam = ch_filtered_consensus_bam rna_splice_junctions = FASTQ_ALIGN_RNA.out.splice_junctions rna_junctions = FASTQ_ALIGN_RNA.out.junctions sormadup_metrics = ch_sormadup_metrics + family_size_histogram = ch_family_size_histogram align_reports = FASTQ_ALIGN_DNA.out.reports } diff --git a/subworkflows/local/umi_consensus/main.nf b/subworkflows/local/umi_consensus/main.nf new file mode 100644 index 00000000..476df3e2 --- /dev/null +++ b/subworkflows/local/umi_consensus/main.nf @@ -0,0 +1,66 @@ +#!/usr/bin/env nextflow + +// MODULES +include { FGUMI_EXTRACT } from "../../../modules/local/fgumi/extract/main.nf" +include { FGUMI_FILTER } from "../../../modules/local/fgumi/filter/main.nf" +include { FGUMI_GROUP } from "../../../modules/local/fgumi/group/main.nf" +include { FGUMI_SIMPLEX } from "../../../modules/local/fgumi/simplex/main.nf" +include { FGUMI_SNAP_ZIPPER_SORT } from "../../../modules/local/fgumi/snapzippersort/main.nf" +include { FGUMI_SORT } from "../../../modules/local/fgumi/sort/main.nf" + +// FUNCTIONS +include { getGenomeAttribute } from '../../local/utils_nfcore_preprocessing_pipeline' + +workflow UMI_CONSENSUS_FGUMI { + take: + ch_meta_reads_aligner_index_fasta // channel: [mandatory] [meta, reads, aligner, index, fasta] + + main: + // Step 1: build an unmapped BAM with UMI tags from input FASTQ. + FGUMI_EXTRACT( + ch_meta_reads_aligner_index_fasta + .map { meta, reads, _aligner, _index, _fasta -> [meta, reads] } + ) + + // Step 3: align with SNAP, zipper tags back, then template-coordinate sort. 
+ FGUMI_SNAP_ZIPPER_SORT( + FGUMI_EXTRACT.out.bam + .join( + ch_meta_reads_aligner_index_fasta.map { meta, _reads, _aligner, _index, fasta -> + [meta, getGenomeAttribute(meta.genome_data, 'snap'), fasta, getGenomeAttribute(meta.genome_data, 'dict')] + }, + by: 0, + ) + .map { meta, unmapped_bam, snap_index, fasta, dict -> [meta, unmapped_bam, snap_index, fasta, dict] } + ) + + FGUMI_GROUP( + FGUMI_SNAP_ZIPPER_SORT.out.bam + ) + + FGUMI_SIMPLEX( + FGUMI_GROUP.out.bam + ) + + // Step 7: filter consensus reads, then coordinate-sort/index for downstream CRAM conversion. + FGUMI_FILTER( + FGUMI_SIMPLEX.out.bam + .join( + ch_meta_reads_aligner_index_fasta.map { meta, _reads, _aligner, _index, fasta -> [meta, fasta] }, + by: 0, + ) + .map { meta, bam, fasta -> [meta, bam, fasta] } + ) + + FGUMI_SORT( + FGUMI_FILTER.out.bam + ) + + emit: + bam_bai = FGUMI_SORT.out.bam.join(FGUMI_SORT.out.bai, failOnMismatch: true, failOnDuplicate: true) + grouping_metrics = FGUMI_GROUP.out.grouping_metrics + family_size_histogram = FGUMI_GROUP.out.family_size_histogram + consensus_metrics = FGUMI_SIMPLEX.out.consensus_metrics + filtering_metrics = FGUMI_FILTER.out.filtering_metrics + filtered_consensus_bam = FGUMI_SORT.out.bam +} diff --git a/tests/inputs/fgumi/snap_index/OverflowTable.txt b/tests/inputs/fgumi/snap_index/OverflowTable.txt new file mode 100644 index 00000000..48cdce85 --- /dev/null +++ b/tests/inputs/fgumi/snap_index/OverflowTable.txt @@ -0,0 +1 @@ +placeholder diff --git a/tests/inputs/test.yml b/tests/inputs/test.yml index 3a432b4b..c09566ea 100644 --- a/tests/inputs/test.yml +++ b/tests/inputs/test.yml @@ -49,3 +49,18 @@ run_coverage: true fastq_1: https://github.com/nf-cmgg/test-datasets/raw/main/data/genomics/homo_sapiens/illumina/fastq/test_R1.fastq.gz fastq_2: https://github.com/nf-cmgg/test-datasets/raw/main/data/genomics/homo_sapiens/illumina/fastq/test_R2.fastq.gz +# UMI consensus (fgumi) inputs +# Example DNA sample with fgumi_aware enabled for fgumi processing. 
+- id: sample1 + samplename: sample1-chr21 + library: test_library + organism: Homo sapiens + tag: WES + sample_type: DNA + aligner: snap + markdup: bamsormadup + fgumi_aware: true + run_coverage: true + fastq_1: https://github.com/nf-cmgg/test-datasets/raw/main/data/genomics/homo_sapiens/illumina/fastq/sample1_S31_R1_001.fastq.gz + fastq_2: https://github.com/nf-cmgg/test-datasets/raw/main/data/genomics/homo_sapiens/illumina/fastq/sample1_S31_R2_001.fastq.gz + diff --git a/tests/modules/local/fgumi/extract/main.nf.test b/tests/modules/local/fgumi/extract/main.nf.test new file mode 100644 index 00000000..40cf1af8 --- /dev/null +++ b/tests/modules/local/fgumi/extract/main.nf.test @@ -0,0 +1,35 @@ +nextflow_process { + + name "Test Process FGUMI_EXTRACT" + script "modules/local/fgumi/extract/main.nf" + process "FGUMI_EXTRACT" + + tag "modules" + tag "modules/local" + tag "modules/local/fgumi" + tag "modules/local/fgumi/extract" + + test("test - stub") { + // Stub-mode contract test: verifies extract output channel and versions tuple. 
+ options "-stub" + + when { + process { + """ + input[0] = [ + [id: "test", samplename: "test", library: "lib1"], + [ + file("https://github.com/nf-cmgg/test-datasets/raw/main/data/genomics/homo_sapiens/illumina/fastq/sample1_S31_R1_001.fastq.gz", checkIfExists: true), + file("https://github.com/nf-cmgg/test-datasets/raw/main/data/genomics/homo_sapiens/illumina/fastq/sample1_S31_R2_001.fastq.gz", checkIfExists: true) + ] + ] + """ + } + } + + then { + assert process.success + assert snapshot(process.out).match() + } + } +} diff --git a/tests/modules/local/fgumi/extract/main.nf.test.snap b/tests/modules/local/fgumi/extract/main.nf.test.snap new file mode 100644 index 00000000..82b09e4b --- /dev/null +++ b/tests/modules/local/fgumi/extract/main.nf.test.snap @@ -0,0 +1,47 @@ +{ + "test - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "samplename": "test", + "library": "lib1" + }, + "test.fgumi.unmapped.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + "FGUMI_EXTRACT", + "fgumi", + "0.1.2" + ] + ], + "bam": [ + [ + { + "id": "test", + "samplename": "test", + "library": "lib1" + }, + "test.fgumi.unmapped.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions_fgumi": [ + [ + "FGUMI_EXTRACT", + "fgumi", + "0.1.2" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.10.4" + }, + "timestamp": "2026-04-02T15:59:46.342116682" + } +} \ No newline at end of file diff --git a/tests/modules/local/fgumi/filter/main.nf.test b/tests/modules/local/fgumi/filter/main.nf.test new file mode 100644 index 00000000..51c6f8c8 --- /dev/null +++ b/tests/modules/local/fgumi/filter/main.nf.test @@ -0,0 +1,33 @@ +nextflow_process { + + name "Test Process FGUMI_FILTER" + script "modules/local/fgumi/filter/main.nf" + process "FGUMI_FILTER" + + tag "modules" + tag "modules/local" + tag "modules/local/fgumi" + tag "modules/local/fgumi/filter" + + test("test - stub") { + // Stub-mode contract test: verifies filtered BAM and metrics outputs. 
+ options "-stub" + + when { + process { + """ + input[0] = [ + [id: "test"], + file("https://github.com/nf-cmgg/test-datasets/raw/main/data/genomics/homo_sapiens/illumina/bam/consensus.bam", checkIfExists: true), + file("https://github.com/nf-cmgg/test-datasets/raw/preprocessing/data/genomics/homo_sapiens/genome/seq/GCA_000001405.15_GRCh38_full_plus_hs38d1_analysis_set_chr21.fna", checkIfExists: true) + ] + """ + } + } + + then { + assert process.success + assert snapshot(process.out).match() + } + } +} diff --git a/tests/modules/local/fgumi/filter/main.nf.test.snap b/tests/modules/local/fgumi/filter/main.nf.test.snap new file mode 100644 index 00000000..7f59c001 --- /dev/null +++ b/tests/modules/local/fgumi/filter/main.nf.test.snap @@ -0,0 +1,59 @@ +{ + "test - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.fgumi.filter.filtered.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + { + "id": "test" + }, + "test.fgumi.filter.filtering_metrics.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + [ + "FGUMI_FILTER", + "fgumi", + "0.1.2" + ] + ], + "bam": [ + [ + { + "id": "test" + }, + "test.fgumi.filter.filtered.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "filtering_metrics": [ + [ + { + "id": "test" + }, + "test.fgumi.filter.filtering_metrics.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions_fgumi": [ + [ + "FGUMI_FILTER", + "fgumi", + "0.1.2" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.10.4" + }, + "timestamp": "2026-04-08T10:07:36.721154372" + } +} \ No newline at end of file diff --git a/tests/modules/local/fgumi/group/main.nf.test b/tests/modules/local/fgumi/group/main.nf.test new file mode 100644 index 00000000..cd54ae2f --- /dev/null +++ b/tests/modules/local/fgumi/group/main.nf.test @@ -0,0 +1,32 @@ +nextflow_process { + + name "Test Process FGUMI_GROUP" + script "modules/local/fgumi/group/main.nf" + process "FGUMI_GROUP" + + tag "modules" + tag "modules/local" + 
tag "modules/local/fgumi" + tag "modules/local/fgumi/group" + + test("test - stub") { + // Stub-mode contract test: verifies grouping and family-size outputs. + options "-stub" + + when { + process { + """ + input[0] = [ + [id: "test"], + file("https://github.com/nf-cmgg/test-datasets/raw/main/data/genomics/homo_sapiens/illumina/bam/template.bam", checkIfExists: true) + ] + """ + } + } + + then { + assert process.success + assert snapshot(process.out).match() + } + } +} diff --git a/tests/modules/local/fgumi/group/main.nf.test.snap b/tests/modules/local/fgumi/group/main.nf.test.snap new file mode 100644 index 00000000..d242fb4a --- /dev/null +++ b/tests/modules/local/fgumi/group/main.nf.test.snap @@ -0,0 +1,75 @@ +{ + "test - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.fgumi.group.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + { + "id": "test" + }, + "test.fgumi.group.grouping_metrics.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + [ + { + "id": "test" + }, + "test.fgumi.group.family_size_histogram.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "3": [ + [ + "FGUMI_GROUP", + "fgumi", + "0.1.2" + ] + ], + "bam": [ + [ + { + "id": "test" + }, + "test.fgumi.group.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "family_size_histogram": [ + [ + { + "id": "test" + }, + "test.fgumi.group.family_size_histogram.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "grouping_metrics": [ + [ + { + "id": "test" + }, + "test.fgumi.group.grouping_metrics.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions_fgumi": [ + [ + "FGUMI_GROUP", + "fgumi", + "0.1.2" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.10.4" + }, + "timestamp": "2026-04-02T16:05:08.118138918" + } +} \ No newline at end of file diff --git a/tests/modules/local/fgumi/simplex/main.nf.test b/tests/modules/local/fgumi/simplex/main.nf.test new file mode 100644 index 00000000..9ff5c017 --- /dev/null +++ 
b/tests/modules/local/fgumi/simplex/main.nf.test @@ -0,0 +1,32 @@ +nextflow_process { + + name "Test Process FGUMI_SIMPLEX" + script "modules/local/fgumi/simplex/main.nf" + process "FGUMI_SIMPLEX" + + tag "modules" + tag "modules/local" + tag "modules/local/fgumi" + tag "modules/local/fgumi/simplex" + + test("test - stub") { + // Stub-mode contract test: verifies simplex BAM and consensus metrics. + options "-stub" + + when { + process { + """ + input[0] = [ + [id: "test"], + file("https://github.com/nf-cmgg/test-datasets/raw/main/data/genomics/homo_sapiens/illumina/bam/grouped.bam", checkIfExists: true) + ] + """ + } + } + + then { + assert process.success + assert snapshot(process.out).match() + } + } +} diff --git a/tests/modules/local/fgumi/simplex/main.nf.test.snap b/tests/modules/local/fgumi/simplex/main.nf.test.snap new file mode 100644 index 00000000..14ecba3d --- /dev/null +++ b/tests/modules/local/fgumi/simplex/main.nf.test.snap @@ -0,0 +1,59 @@ +{ + "test - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.fgumi.simplex.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + { + "id": "test" + }, + "test.fgumi.simplex.consensus_metrics.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + [ + "FGUMI_SIMPLEX", + "fgumi", + "0.1.2" + ] + ], + "bam": [ + [ + { + "id": "test" + }, + "test.fgumi.simplex.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "consensus_metrics": [ + [ + { + "id": "test" + }, + "test.fgumi.simplex.consensus_metrics.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions_fgumi": [ + [ + "FGUMI_SIMPLEX", + "fgumi", + "0.1.2" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.10.4" + }, + "timestamp": "2026-04-02T16:05:15.268242503" + } +} \ No newline at end of file diff --git a/tests/modules/local/fgumi/snapzippersort/main.nf.test b/tests/modules/local/fgumi/snapzippersort/main.nf.test new file mode 100644 index 00000000..c3a34cbe --- /dev/null +++ 
b/tests/modules/local/fgumi/snapzippersort/main.nf.test @@ -0,0 +1,35 @@ +nextflow_process { + + name "Test Process FGUMI_SNAP_ZIPPER_SORT" + script "modules/local/fgumi/snapzippersort/main.nf" + process "FGUMI_SNAP_ZIPPER_SORT" + + tag "modules" + tag "modules/local" + tag "modules/local/fgumi" + tag "modules/local/fgumi/snapzippersort" + + test("test - stub") { + // Stub-mode contract test: verifies template BAM output wiring. + options "-stub" + + when { + process { + """ + input[0] = [ + [id: "test", samplename: "test"], + file("https://github.com/nf-cmgg/test-datasets/raw/main/data/genomics/homo_sapiens/illumina/bam/unmapped.bam", checkIfExists: true), + file("${projectDir}/tests/inputs/fgumi/snap_index", checkIfExists: true), + file("https://github.com/nf-cmgg/test-datasets/raw/preprocessing/data/genomics/homo_sapiens/genome/seq/GCA_000001405.15_GRCh38_full_plus_hs38d1_analysis_set_chr21.fna", checkIfExists: true), + file("https://github.com/nf-cmgg/test-datasets/raw/preprocessing/data/genomics/homo_sapiens/genome/seq/GCA_000001405.15_GRCh38_full_plus_hs38d1_analysis_set_chr21.dict", checkIfExists: true) + ] + """ + } + } + + then { + assert process.success + assert snapshot(process.out).match() + } + } +} diff --git a/tests/modules/local/fgumi/snapzippersort/main.nf.test.snap b/tests/modules/local/fgumi/snapzippersort/main.nf.test.snap new file mode 100644 index 00000000..d575f1ef --- /dev/null +++ b/tests/modules/local/fgumi/snapzippersort/main.nf.test.snap @@ -0,0 +1,45 @@ +{ + "test - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "samplename": "test" + }, + "test.fgumi.template.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + "FGUMI_SNAP_ZIPPER_SORT", + "fgumi", + "0.1.2" + ] + ], + "bam": [ + [ + { + "id": "test", + "samplename": "test" + }, + "test.fgumi.template.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions_fgumi": [ + [ + "FGUMI_SNAP_ZIPPER_SORT", + "fgumi", + "0.1.2" + ] + ] + } + ], + "meta": { + 
"nf-test": "0.9.2", + "nextflow": "25.10.4" + }, + "timestamp": "2026-04-02T16:05:22.614091027" + } +} \ No newline at end of file diff --git a/tests/modules/local/fgumi/sort/main.nf.test b/tests/modules/local/fgumi/sort/main.nf.test new file mode 100644 index 00000000..380727d0 --- /dev/null +++ b/tests/modules/local/fgumi/sort/main.nf.test @@ -0,0 +1,33 @@ +nextflow_process { + + name "Test Process FGUMI_SORT" + script "modules/local/fgumi/sort/main.nf" + process "FGUMI_SORT" + + tag "modules" + tag "modules/local" + tag "modules/local/fgumi" + tag "modules/local/fgumi/sort" + + test("test - stub") { + // Stub-mode contract test: verifies coordinate-sorted BAM/index outputs. + options "-stub" + + when { + process { + """ + input[0] = [ + [id: "test"], + file("https://github.com/nf-cmgg/test-datasets/raw/main/data/genomics/homo_sapiens/illumina/bam/consensus.bam", checkIfExists: true) + ] + """ + } + } + + then { + assert process.success + assert snapshot(process.out).match() + } + } + +} diff --git a/tests/modules/local/fgumi/sort/main.nf.test.snap b/tests/modules/local/fgumi/sort/main.nf.test.snap new file mode 100644 index 00000000..c597cb17 --- /dev/null +++ b/tests/modules/local/fgumi/sort/main.nf.test.snap @@ -0,0 +1,59 @@ +{ + "test - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.fgumi.filter.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + { + "id": "test" + }, + "test.fgumi.filter.bam.bai:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + [ + "FGUMI_SORT", + "fgumi", + "0.1.2" + ] + ], + "bai": [ + [ + { + "id": "test" + }, + "test.fgumi.filter.bam.bai:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "bam": [ + [ + { + "id": "test" + }, + "test.fgumi.filter.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions_fgumi": [ + [ + "FGUMI_SORT", + "fgumi", + "0.1.2" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.10.4" + }, + "timestamp": "2026-04-08T10:07:49.850827528" + } +} \ No 
newline at end of file diff --git a/tests/subworkflows/local/fastq_to_aligned_cram/fgumi_umi_stub.nf.test b/tests/subworkflows/local/fastq_to_aligned_cram/fgumi_umi_stub.nf.test new file mode 100644 index 00000000..c03f41e3 --- /dev/null +++ b/tests/subworkflows/local/fastq_to_aligned_cram/fgumi_umi_stub.nf.test @@ -0,0 +1,56 @@ +nextflow_workflow { + + name "Test Workflow FASTQ_TO_CRAM UMI fgumi stub" + script "subworkflows/local/fastq_to_aligned_cram/main.nf" + workflow "FASTQ_TO_CRAM" + + tag "subworkflows" + tag "subworkflows/local" + tag "subworkflows/local/fastq_to_aligned_cram" + tag "fgumi" + + test("fastq to cram - fgumi umi-aware - stub") { + // End-to-end UMI branch contract test in stub mode. + options "-stub" + when { + workflow { + """ + input[0] = Channel.of([ + [ + id: "UMI_consensus1", + samplename: "HT1080_chr20", + single_end: false, + sample_type: "DNA", + markdup: "bamsormadup", + fgumi_aware: true, + genome_data: [ + fasta: "s3://test-data/genomics/homo_sapiens/genome/seq/GCA_000001405.15_GRCh38_full_plus_hs38d1_analysis_set_chr21.fna", + fai: "s3://test-data/genomics/homo_sapiens/genome/seq/GCA_000001405.15_GRCh38_full_plus_hs38d1_analysis_set_chr21.fna.fai", + dict: "s3://test-data/genomics/homo_sapiens/genome/seq/GCA_000001405.15_GRCh38_full_plus_hs38d1_analysis_set_chr21.dict", + snap: "s3://reference-data/genomes/Hsapiens/GRCh38/snapaligner/" + ] + ], + [ + file("https://github.com/nf-cmgg/test-datasets/raw/preprocessing/data/genomics/homo_sapiens/illumina/fastq/sample1_R1.fastq.gz", checkIfExists: true), + file("https://github.com/nf-cmgg/test-datasets/raw/preprocessing/data/genomics/homo_sapiens/illumina/fastq/sample1_R2.fastq.gz", checkIfExists: true) + ], + "snap", + file("s3://reference-data/genomes/Hsapiens/GRCh38/snapaligner/", checkIfExists: true), + file("s3://test-data/genomics/homo_sapiens/genome/seq/GCA_000001405.15_GRCh38_full_plus_hs38d1_analysis_set_chr21.fna", checkIfExists: true), + [] + ]) + """ + } + } + + then { + 
assert workflow.success + // Explicitly assert newly exposed UMI channels. + assert workflow.out.family_size_histogram.size() == 1 + assert workflow.out.filtered_consensus_bam.size() == 1 + assert snapshot( + sanitizeOutput(workflow.out, unstableKeys:["cram_crai"]) + ).match() + } + } +} diff --git a/tests/subworkflows/local/fastq_to_aligned_cram/fgumi_umi_stub.nf.test.snap b/tests/subworkflows/local/fastq_to_aligned_cram/fgumi_umi_stub.nf.test.snap new file mode 100644 index 00000000..55035830 --- /dev/null +++ b/tests/subworkflows/local/fastq_to_aligned_cram/fgumi_umi_stub.nf.test.snap @@ -0,0 +1,162 @@ +{ + "fastq to cram - fgumi umi-aware - stub": { + "content": [ + { + "align_reports": [ + + ], + "cram_crai": [ + [ + { + "fgumi_aware": true, + "genome_data": { + "dict": "ref.dict", + "fai": "ref.fa.fai", + "fasta": "ref.fa", + "snap": "snap_index" + }, + "id": "UMI_consensus1", + "markdup": "bamsormadup", + "sample_type": "DNA", + "samplename": "HT1080_chr20", + "single_end": false + }, + "UMI_consensus1.cram", + "UMI_consensus1.cram.crai" + ] + ], + "family_size_histogram": [ + [ + { + "id": "UMI_consensus1", + "samplename": "HT1080_chr20", + "single_end": false, + "sample_type": "DNA", + "markdup": "bamsormadup", + "fgumi_aware": true, + "genome_data": { + "fasta": "ref.fa:md5,78724096432e1b2702881f0126c656f2", + "fai": "ref.fa.fai:md5,fdd09c27f9ecf4cc7e824fd8407b72b7", + "dict": "ref.dict:md5,759dea0d5194dec0cd4f3e02fadf6e83", + "snap": [ + "OverflowTable.txt:md5,178164f81917b8e87073295a635588de" + ] + } + }, + "UMI_consensus1.fgumi.group.family_size_histogram.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "filtered_consensus_bam": [ + [ + { + "id": "UMI_consensus1", + "samplename": "HT1080_chr20", + "single_end": false, + "sample_type": "DNA", + "markdup": "bamsormadup", + "fgumi_aware": true, + "genome_data": { + "fasta": "ref.fa:md5,78724096432e1b2702881f0126c656f2", + "fai": "ref.fa.fai:md5,fdd09c27f9ecf4cc7e824fd8407b72b7", + "dict": 
"ref.dict:md5,759dea0d5194dec0cd4f3e02fadf6e83", + "snap": [ + "OverflowTable.txt:md5,178164f81917b8e87073295a635588de" + ] + } + }, + "UMI_consensus1.fgumi.filter.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "rna_junctions": [ + + ], + "rna_splice_junctions": [ + + ], + "sormadup_metrics": [ + [ + { + "id": "UMI_consensus1", + "samplename": "HT1080_chr20", + "single_end": false, + "sample_type": "DNA", + "markdup": "bamsormadup", + "fgumi_aware": true, + "genome_data": { + "fasta": "ref.fa:md5,78724096432e1b2702881f0126c656f2", + "fai": "ref.fa.fai:md5,fdd09c27f9ecf4cc7e824fd8407b72b7", + "dict": "ref.dict:md5,759dea0d5194dec0cd4f3e02fadf6e83", + "snap": [ + "OverflowTable.txt:md5,178164f81917b8e87073295a635588de" + ] + } + }, + "UMI_consensus1.fgumi.filter.filtering_metrics.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ], + [ + { + "id": "UMI_consensus1", + "samplename": "HT1080_chr20", + "single_end": false, + "sample_type": "DNA", + "markdup": "bamsormadup", + "fgumi_aware": true, + "genome_data": { + "fasta": "ref.fa:md5,78724096432e1b2702881f0126c656f2", + "fai": "ref.fa.fai:md5,fdd09c27f9ecf4cc7e824fd8407b72b7", + "dict": "ref.dict:md5,759dea0d5194dec0cd4f3e02fadf6e83", + "snap": [ + "OverflowTable.txt:md5,178164f81917b8e87073295a635588de" + ] + } + }, + "UMI_consensus1.fgumi.group.family_size_histogram.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ], + [ + { + "id": "UMI_consensus1", + "samplename": "HT1080_chr20", + "single_end": false, + "sample_type": "DNA", + "markdup": "bamsormadup", + "fgumi_aware": true, + "genome_data": { + "fasta": "ref.fa:md5,78724096432e1b2702881f0126c656f2", + "fai": "ref.fa.fai:md5,fdd09c27f9ecf4cc7e824fd8407b72b7", + "dict": "ref.dict:md5,759dea0d5194dec0cd4f3e02fadf6e83", + "snap": [ + "OverflowTable.txt:md5,178164f81917b8e87073295a635588de" + ] + } + }, + "UMI_consensus1.fgumi.group.grouping_metrics.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ], + [ + { + "id": "UMI_consensus1", + "samplename": "HT1080_chr20", + 
"single_end": false, + "sample_type": "DNA", + "markdup": "bamsormadup", + "fgumi_aware": true, + "genome_data": { + "fasta": "ref.fa:md5,78724096432e1b2702881f0126c656f2", + "fai": "ref.fa.fai:md5,fdd09c27f9ecf4cc7e824fd8407b72b7", + "dict": "ref.dict:md5,759dea0d5194dec0cd4f3e02fadf6e83", + "snap": [ + "OverflowTable.txt:md5,178164f81917b8e87073295a635588de" + ] + } + }, + "UMI_consensus1.fgumi.simplex.consensus_metrics.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.10.4" + }, + "timestamp": "2026-04-09T10:20:21.479979029" + } +} \ No newline at end of file diff --git a/tests/subworkflows/local/fastq_to_aligned_cram/main.nf.test b/tests/subworkflows/local/fastq_to_aligned_cram/main.nf.test index 65b68c4c..efaaabd4 100644 --- a/tests/subworkflows/local/fastq_to_aligned_cram/main.nf.test +++ b/tests/subworkflows/local/fastq_to_aligned_cram/main.nf.test @@ -219,4 +219,50 @@ nextflow_workflow { ) } } + + test("fastq to cram - fgumi umi-aware - stub") { + options "-stub" + when { + workflow { + """ + // [meta, [fq_1,fq_2], aligner, index, fasta, gtf] + input[0] = Channel.of([ + [ + id: "UMI_consensus1", + samplename: "HT1080_chr20", + single_end: false, + sample_type: "DNA", + markdup: "bamsormadup", + fgumi_aware: true, + genome_data: [ + fasta: "${projectDir}/tests/inputs/fgumi/ref.fa", + fai: "${projectDir}/tests/inputs/fgumi/ref.fa.fai", + dict: "${projectDir}/tests/inputs/fgumi/ref.dict", + snap: "${projectDir}/tests/inputs/fgumi/snap_index" + ] + ], + [ + file("${projectDir}/tests/inputs/fgumi/R1.fastq.gz", checkIfExists: true), + file("${projectDir}/tests/inputs/fgumi/R2.fastq.gz", checkIfExists: true) + ], + "snap", + file("${projectDir}/tests/inputs/fgumi/snap_index", checkIfExists: true), + file("${projectDir}/tests/inputs/fgumi/ref.fa", checkIfExists: true), + [] + ]) + """ + } + } + + then { + assertAll( + { + assert workflow.success + assert snapshot( + sanitizeOutput(workflow.out, 
unstableKeys:["cram_crai"]) + ).match() + } + ) + } + } } diff --git a/tests/subworkflows/local/umi_consensus/main.nf.test b/tests/subworkflows/local/umi_consensus/main.nf.test new file mode 100644 index 00000000..7135f5db --- /dev/null +++ b/tests/subworkflows/local/umi_consensus/main.nf.test @@ -0,0 +1,55 @@ +nextflow_workflow { + + name "Test Workflow UMI_CONSENSUS_FGUMI" + script "subworkflows/local/umi_consensus/main.nf" + workflow "UMI_CONSENSUS_FGUMI" + + tag "subworkflows" + tag "subworkflows/local" + tag "subworkflows/local/umi_consensus" + tag "fgumi" + + test("umi consensus fgumi - stub") { + options "-stub" + when { + workflow { + """ + input[0] = Channel.of([ + [ + id: "UMI_consensus1", + samplename: "HT1080_chr20", + single_end: false, + sample_type: "DNA", + markdup: "bamsormadup", + fgumi_aware: true, + genome_data: [ + fasta: "s3://test-data/genomics/homo_sapiens/genome/seq/GCA_000001405.15_GRCh38_full_plus_hs38d1_analysis_set_chr21.fna", + fai: "s3://test-data/genomics/homo_sapiens/genome/seq/GCA_000001405.15_GRCh38_full_plus_hs38d1_analysis_set_chr21.fna.fai", + dict: "s3://test-data/genomics/homo_sapiens/genome/seq/GCA_000001405.15_GRCh38_full_plus_hs38d1_analysis_set_chr21.dict", + snap: "s3://reference-data/genomes/Hsapiens/GRCh38/snapaligner/" + ] + ], + [ + file("https://github.com/nf-cmgg/test-datasets/raw/preprocessing/data/genomics/homo_sapiens/illumina/fastq/sample1_R1.fastq.gz", checkIfExists: true), + file("https://github.com/nf-cmgg/test-datasets/raw/preprocessing/data/genomics/homo_sapiens/illumina/fastq/sample1_R2.fastq.gz", checkIfExists: true) + ], + "snap", + file("s3://reference-data/genomes/Hsapiens/GRCh38/snapaligner/", checkIfExists: true), + file("s3://test-data/genomics/homo_sapiens/genome/seq/GCA_000001405.15_GRCh38_full_plus_hs38d1_analysis_set_chr21.fna", checkIfExists: true) + ]) + """ + } + } + + then { + assert workflow.success + assert workflow.out.bam_bai.size() == 1 + assert workflow.out.grouping_metrics.size() == 
1 + assert workflow.out.family_size_histogram.size() == 1 + assert workflow.out.consensus_metrics.size() == 1 + assert workflow.out.filtering_metrics.size() == 1 + assert workflow.out.filtered_consensus_bam.size() == 1 + assert snapshot(workflow.out).match() + } + } +} diff --git a/tests/subworkflows/local/umi_consensus/main.nf.test.snap b/tests/subworkflows/local/umi_consensus/main.nf.test.snap new file mode 100644 index 00000000..9dbc33d6 --- /dev/null +++ b/tests/subworkflows/local/umi_consensus/main.nf.test.snap @@ -0,0 +1,267 @@ +{ + "umi consensus fgumi - stub": { + "content": [ + { + "0": [ + [ + { + "id": "UMI_consensus1", + "samplename": "HT1080_chr20", + "single_end": false, + "sample_type": "DNA", + "markdup": "bamsormadup", + "fgumi_aware": true, + "genome_data": { + "fasta": "ref.fa:md5,78724096432e1b2702881f0126c656f2", + "fai": "ref.fa.fai:md5,fdd09c27f9ecf4cc7e824fd8407b72b7", + "dict": "ref.dict:md5,759dea0d5194dec0cd4f3e02fadf6e83", + "snap": [ + "OverflowTable.txt:md5,178164f81917b8e87073295a635588de" + ] + } + }, + "UMI_consensus1.fgumi.filter.bam:md5,d41d8cd98f00b204e9800998ecf8427e", + "UMI_consensus1.fgumi.filter.bam.bai:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + { + "id": "UMI_consensus1", + "samplename": "HT1080_chr20", + "single_end": false, + "sample_type": "DNA", + "markdup": "bamsormadup", + "fgumi_aware": true, + "genome_data": { + "fasta": "ref.fa:md5,78724096432e1b2702881f0126c656f2", + "fai": "ref.fa.fai:md5,fdd09c27f9ecf4cc7e824fd8407b72b7", + "dict": "ref.dict:md5,759dea0d5194dec0cd4f3e02fadf6e83", + "snap": [ + "OverflowTable.txt:md5,178164f81917b8e87073295a635588de" + ] + } + }, + "UMI_consensus1.fgumi.group.grouping_metrics.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + [ + { + "id": "UMI_consensus1", + "samplename": "HT1080_chr20", + "single_end": false, + "sample_type": "DNA", + "markdup": "bamsormadup", + "fgumi_aware": true, + "genome_data": { + "fasta": 
"ref.fa:md5,78724096432e1b2702881f0126c656f2", + "fai": "ref.fa.fai:md5,fdd09c27f9ecf4cc7e824fd8407b72b7", + "dict": "ref.dict:md5,759dea0d5194dec0cd4f3e02fadf6e83", + "snap": [ + "OverflowTable.txt:md5,178164f81917b8e87073295a635588de" + ] + } + }, + "UMI_consensus1.fgumi.group.family_size_histogram.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "3": [ + [ + { + "id": "UMI_consensus1", + "samplename": "HT1080_chr20", + "single_end": false, + "sample_type": "DNA", + "markdup": "bamsormadup", + "fgumi_aware": true, + "genome_data": { + "fasta": "ref.fa:md5,78724096432e1b2702881f0126c656f2", + "fai": "ref.fa.fai:md5,fdd09c27f9ecf4cc7e824fd8407b72b7", + "dict": "ref.dict:md5,759dea0d5194dec0cd4f3e02fadf6e83", + "snap": [ + "OverflowTable.txt:md5,178164f81917b8e87073295a635588de" + ] + } + }, + "UMI_consensus1.fgumi.simplex.consensus_metrics.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "4": [ + [ + { + "id": "UMI_consensus1", + "samplename": "HT1080_chr20", + "single_end": false, + "sample_type": "DNA", + "markdup": "bamsormadup", + "fgumi_aware": true, + "genome_data": { + "fasta": "ref.fa:md5,78724096432e1b2702881f0126c656f2", + "fai": "ref.fa.fai:md5,fdd09c27f9ecf4cc7e824fd8407b72b7", + "dict": "ref.dict:md5,759dea0d5194dec0cd4f3e02fadf6e83", + "snap": [ + "OverflowTable.txt:md5,178164f81917b8e87073295a635588de" + ] + } + }, + "UMI_consensus1.fgumi.filter.filtering_metrics.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "5": [ + [ + { + "id": "UMI_consensus1", + "samplename": "HT1080_chr20", + "single_end": false, + "sample_type": "DNA", + "markdup": "bamsormadup", + "fgumi_aware": true, + "genome_data": { + "fasta": "ref.fa:md5,78724096432e1b2702881f0126c656f2", + "fai": "ref.fa.fai:md5,fdd09c27f9ecf4cc7e824fd8407b72b7", + "dict": "ref.dict:md5,759dea0d5194dec0cd4f3e02fadf6e83", + "snap": [ + "OverflowTable.txt:md5,178164f81917b8e87073295a635588de" + ] + } + }, + "UMI_consensus1.fgumi.filter.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + 
"bam_bai": [ + [ + { + "id": "UMI_consensus1", + "samplename": "HT1080_chr20", + "single_end": false, + "sample_type": "DNA", + "markdup": "bamsormadup", + "fgumi_aware": true, + "genome_data": { + "fasta": "ref.fa:md5,78724096432e1b2702881f0126c656f2", + "fai": "ref.fa.fai:md5,fdd09c27f9ecf4cc7e824fd8407b72b7", + "dict": "ref.dict:md5,759dea0d5194dec0cd4f3e02fadf6e83", + "snap": [ + "OverflowTable.txt:md5,178164f81917b8e87073295a635588de" + ] + } + }, + "UMI_consensus1.fgumi.filter.bam:md5,d41d8cd98f00b204e9800998ecf8427e", + "UMI_consensus1.fgumi.filter.bam.bai:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "consensus_metrics": [ + [ + { + "id": "UMI_consensus1", + "samplename": "HT1080_chr20", + "single_end": false, + "sample_type": "DNA", + "markdup": "bamsormadup", + "fgumi_aware": true, + "genome_data": { + "fasta": "ref.fa:md5,78724096432e1b2702881f0126c656f2", + "fai": "ref.fa.fai:md5,fdd09c27f9ecf4cc7e824fd8407b72b7", + "dict": "ref.dict:md5,759dea0d5194dec0cd4f3e02fadf6e83", + "snap": [ + "OverflowTable.txt:md5,178164f81917b8e87073295a635588de" + ] + } + }, + "UMI_consensus1.fgumi.simplex.consensus_metrics.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "family_size_histogram": [ + [ + { + "id": "UMI_consensus1", + "samplename": "HT1080_chr20", + "single_end": false, + "sample_type": "DNA", + "markdup": "bamsormadup", + "fgumi_aware": true, + "genome_data": { + "fasta": "ref.fa:md5,78724096432e1b2702881f0126c656f2", + "fai": "ref.fa.fai:md5,fdd09c27f9ecf4cc7e824fd8407b72b7", + "dict": "ref.dict:md5,759dea0d5194dec0cd4f3e02fadf6e83", + "snap": [ + "OverflowTable.txt:md5,178164f81917b8e87073295a635588de" + ] + } + }, + "UMI_consensus1.fgumi.group.family_size_histogram.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "filtered_consensus_bam": [ + [ + { + "id": "UMI_consensus1", + "samplename": "HT1080_chr20", + "single_end": false, + "sample_type": "DNA", + "markdup": "bamsormadup", + "fgumi_aware": true, + "genome_data": { + "fasta": 
"ref.fa:md5,78724096432e1b2702881f0126c656f2", + "fai": "ref.fa.fai:md5,fdd09c27f9ecf4cc7e824fd8407b72b7", + "dict": "ref.dict:md5,759dea0d5194dec0cd4f3e02fadf6e83", + "snap": [ + "OverflowTable.txt:md5,178164f81917b8e87073295a635588de" + ] + } + }, + "UMI_consensus1.fgumi.filter.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "filtering_metrics": [ + [ + { + "id": "UMI_consensus1", + "samplename": "HT1080_chr20", + "single_end": false, + "sample_type": "DNA", + "markdup": "bamsormadup", + "fgumi_aware": true, + "genome_data": { + "fasta": "ref.fa:md5,78724096432e1b2702881f0126c656f2", + "fai": "ref.fa.fai:md5,fdd09c27f9ecf4cc7e824fd8407b72b7", + "dict": "ref.dict:md5,759dea0d5194dec0cd4f3e02fadf6e83", + "snap": [ + "OverflowTable.txt:md5,178164f81917b8e87073295a635588de" + ] + } + }, + "UMI_consensus1.fgumi.filter.filtering_metrics.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "grouping_metrics": [ + [ + { + "id": "UMI_consensus1", + "samplename": "HT1080_chr20", + "single_end": false, + "sample_type": "DNA", + "markdup": "bamsormadup", + "fgumi_aware": true, + "genome_data": { + "fasta": "ref.fa:md5,78724096432e1b2702881f0126c656f2", + "fai": "ref.fa.fai:md5,fdd09c27f9ecf4cc7e824fd8407b72b7", + "dict": "ref.dict:md5,759dea0d5194dec0cd4f3e02fadf6e83", + "snap": [ + "OverflowTable.txt:md5,178164f81917b8e87073295a635588de" + ] + } + }, + "UMI_consensus1.fgumi.group.grouping_metrics.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.10.4" + }, + "timestamp": "2026-04-09T10:19:55.500577314" + } +} \ No newline at end of file diff --git a/workflows/preprocessing.nf b/workflows/preprocessing.nf index 5337ae93..17f679ec 100644 --- a/workflows/preprocessing.nf +++ b/workflows/preprocessing.nf @@ -269,7 +269,9 @@ workflow PREPROCESSING { FASTQ_TO_CRAM( ch_meta_reads_aligner_index_fasta_gtf ) + // Collect both standard and UMI-specific metrics for MultiQC. 
ch_multiqc_files = ch_multiqc_files.mix(FASTQ_TO_CRAM.out.sormadup_metrics) + ch_multiqc_files = ch_multiqc_files.mix(FASTQ_TO_CRAM.out.family_size_histogram) /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -430,6 +432,9 @@ workflow PREPROCESSING { rna_junctions = FASTQ_TO_CRAM.out.rna_junctions align_reports = FASTQ_TO_CRAM.out.align_reports sormadup_metrics = FASTQ_TO_CRAM.out.sormadup_metrics + // UMI-specific outputs exposed at workflow level. + family_size_histogram = FASTQ_TO_CRAM.out.family_size_histogram + umi_filtered_consensus_bam = FASTQ_TO_CRAM.out.filtered_consensus_bam mosdepth_global = COVERAGE.out.mosdepth_global mosdepth_summary = COVERAGE.out.mosdepth_summary mosdepth_regions = COVERAGE.out.mosdepth_regions