diff --git a/CHANGELOG.md b/CHANGELOG.md index 36b2ad811..f667808d2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,11 +25,29 @@ Special thanks to the following for their contributions to the release: ### Enhancements & fixes +- [PR #993](https://github.com/nf-core/rnaseq/pull/993) - Added NGSCheckMate for checking that samples come from the same individual - [PR #1123](https://github.com/nf-core/rnaseq/pull/1123) - Overhaul tximport.r, output length tables - [PR #1124](https://github.com/nf-core/rnaseq/pull/1124) - Ensure pseudoaligner is set if pseudoalignment is not skipped - [PR #1126](https://github.com/nf-core/rnaseq/pull/1126) - Pipeline fails if transcript_fasta not provided and `skip_gtf_filter = true`. - [PR #1127](https://github.com/nf-core/rnaseq/pull/1127) - Enlarge sampling to determine the number of columns in `filter_gtf.py` script. +### Parameters + +| Old parameter | New parameter | +| ------------- | -------------------- | +| | `--ngscheckmate_bed` | + +> **NB:** Parameter has been **updated** if both old and new parameter information is present. +> **NB:** Parameter has been **added** if just the new parameter information is present. +> **NB:** Parameter has been **removed** if new parameter information isn't present. 
+ +### Software dependencies + +| Dependency | Old version | New version | +| -------------- | ----------- | ----------- | +| `bcftools` | | 1.17 | +| `ngscheckmate` | | 1.0.1 | + ## [[3.13.1](https://github.com/nf-core/rnaseq/releases/tag/3.13.1)] - 2023-11-17 ### Enhancements and fixes diff --git a/conf/igenomes.config b/conf/igenomes.config index 3f1143775..3099c6d40 100644 --- a/conf/igenomes.config +++ b/conf/igenomes.config @@ -12,29 +12,31 @@ params { // illumina iGenomes reference file paths genomes { 'GRCh37' { - fasta = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Annotation/README.txt" - mito_name = "MT" - macs_gsize = "2.7e9" - blacklist = "${projectDir}/assets/blacklists/GRCh37-blacklist.bed" + fasta = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/WholeGenomeFasta/genome.fa" + bwa = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/BWAIndex/version0.6.0/" + bowtie2 = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/Bowtie2Index/" + star = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/STARIndex/" + bismark = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/BismarkIndex/" + gtf = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Annotation/Genes/genes.gtf" + bed12 = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Annotation/Genes/genes.bed" + 
ngscheckmate_bed = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Annotation/NGSCheckMate/SNP_GRCh37_hg19_woChr.bed" // "woChr" variant: this Ensembl build uses unprefixed contig names (cf. mito_name = "MT") + readme = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Annotation/README.txt" + mito_name = "MT" + macs_gsize = "2.7e9" + blacklist = "${projectDir}/assets/blacklists/GRCh37-blacklist.bed" } 'GRCh38' { - fasta = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Annotation/Genes/genes.bed" - mito_name = "chrM" - macs_gsize = "2.7e9" - blacklist = "${projectDir}/assets/blacklists/hg38-blacklist.bed" + fasta = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/WholeGenomeFasta/genome.fa" + bwa = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/BWAIndex/version0.6.0/" + bowtie2 = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/Bowtie2Index/" + star = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/STARIndex/" + bismark = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/BismarkIndex/" + gtf = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Annotation/Genes/genes.gtf" + bed12 = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Annotation/Genes/genes.bed" + ngscheckmate_bed = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Annotation/NGSCheckMate/SNP_GRCh38_hg38_wChr.bed" + mito_name = "chrM" + macs_gsize = "2.7e9" + blacklist = "${projectDir}/assets/blacklists/hg38-blacklist.bed" } 'CHM13' { fasta = 
"${params.igenomes_base}/Homo_sapiens/UCSC/CHM13/Sequence/WholeGenomeFasta/genome.fa" diff --git a/conf/modules.config b/conf/modules.config index efef36974..78ce76574 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -1064,6 +1064,29 @@ if (!params.skip_alignment && !params.skip_qc) { } } +if (params.ngscheckmate_bed) { + process { + withName: ".*BAM_NGSCHECKMATE:BCFTOOLS_MPILEUP" { + ext.args2 = '--no-version --ploidy 1 -c' + ext.args3 = '--no-version' + publishDir = [ + path: { "${params.outdir}/${params.aligner}/ngscheckmate/vcfs" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: ".*BAM_NGSCHECKMATE:NGSCHECKMATE_NCM" { + ext.args = '-V' + publishDir = [ + path: { "${params.outdir}/${params.aligner}/ngscheckmate/output" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + } +} + if (!params.skip_multiqc) { process { withName: 'MULTIQC' { diff --git a/conf/test.config b/conf/test.config index f9154ba31..b9c9e4f63 100644 --- a/conf/test.config +++ b/conf/test.config @@ -34,6 +34,7 @@ params { hisat2_index = "https://raw.githubusercontent.com/nf-core/test-datasets/7f1614baeb0ddf66e60be78c3d9fa55440465ac8/reference/hisat2.tar.gz" salmon_index = "https://raw.githubusercontent.com/nf-core/test-datasets/7f1614baeb0ddf66e60be78c3d9fa55440465ac8/reference/salmon.tar.gz" rsem_index = "https://raw.githubusercontent.com/nf-core/test-datasets/7f1614baeb0ddf66e60be78c3d9fa55440465ac8/reference/rsem.tar.gz" + ngscheckmate_bed = "https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/reference/ngscheckmate.bed" // Other parameters skip_bbsplit = false diff --git a/docs/output.md b/docs/output.md index a4c559b3b..02d05f10b 100644 --- a/docs/output.md +++ b/docs/output.md @@ 
-669,6 +669,24 @@ The plot on the left hand side shows the standard PC plot - notice the variable Results generated by MultiQC collate pipeline QC from supported tools i.e. FastQC, Cutadapt, SortMeRNA, STAR, RSEM, HISAT2, Salmon, SAMtools, Picard, RSeQC, Qualimap, Preseq and featureCounts. Additionally, various custom content has been added to the report to assess the output of dupRadar, DESeq2 and featureCounts biotypes, and to highlight samples failing a mimimum mapping threshold or those that failed to match the strand-specificity provided in the input samplesheet. The pipeline has special steps which also allow the software versions to be reported in the MultiQC output for future traceability. For more information about how to use MultiQC reports, see . +### NGSCheckMate + +
+Output files + +- `<ALIGNER>/ngscheckmate/vcfs/` + - `*.vcf.gz`: Compressed VCF files containing the SNPs called for each sample. +- `<ALIGNER>/ngscheckmate/output/` + - `*.pdf`: Plotted dendrogram showing which samples are considered to be matching. + - `*_corr_matrix.txt`: A correlation matrix showing the correlations between each pair of samples. Correlations below the threshold are set to 0. + - `*_matched.txt`: Text file denoting which samples were considered to be matching. One line per match, stating the correlation and the coverage depth (of the sample in the pair with lowest number of reads). + - `*_all.txt`: Text file denoting all the individual comparisons. One line per comparison, stating the correlation and the coverage depth (of the sample in the pair with lowest depth). + +
+ +[NGSCheckMate](https://github.com/parklab/NGSCheckMate) is a tool to verify that samples come from the same individual, by examining a set of single nucleotide polymorphisms (SNPs). This calculates correlations between the samples, and then applies a depth-dependent model of allele fractions to call samples as being related or not. The principal outputs are a dendrogram, where samples that are considered to match are shown as connected, a matrix showing the correlations between each pair of samples, and a text file detailing each match between files. +This requires a BED file specifying the SNPs to consider, provided via the `--ngscheckmate_bed` parameter. The NGSCheckMate GitHub repository provides such sets of BED files for hg19 and hg38 that have been selected as being typically heterogeneous across the population and are present in exonic regions. The tool can also be used to verify samples match between different sequencing modalities, for instance matching RNA-Seq with ATAC-seq, ChIP-seq and WGS. + ## Pseudoalignment and quantification ### Pseudoalignment diff --git a/modules.json b/modules.json index 74bd0d197..de320b61c 100644 --- a/modules.json +++ b/modules.json @@ -10,6 +10,11 @@ "git_sha": "de3e6fc949dcffb8d3508c015f435ace5773ff08", "installed_by": ["modules"] }, + "bcftools/mpileup": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["bam_ngscheckmate"] + }, "cat/fastq": { "branch": "master", "git_sha": "5c460c5a4736974abde2843294f35307ee2b0e5e", @@ -75,6 +80,12 @@ "git_sha": "bdc2a97ced7adc423acfa390742db83cab98c1ad", "installed_by": ["modules"] }, + "ngscheckmate/ncm": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["bam_ngscheckmate"], + "patch": "modules/nf-core/ngscheckmate/ncm/ngscheckmate-ncm.diff" + }, "picard/markduplicates": { "branch": "master", "git_sha": "2ee934606f1fdf7fc1cb05d6e8abc13bec8ab448", @@ -248,6 +259,11 @@ "git_sha": "dedc0e31087f3306101c38835d051bf49789445a", 
"installed_by": ["subworkflows"] }, + "bam_ngscheckmate": { + "branch": "master", + "git_sha": "cfd937a668919d948f6fcbf4218e79de50c2f36f", + "installed_by": ["subworkflows"] + }, "bam_rseqc": { "branch": "master", "git_sha": "a9784afdd5dcda23b84e64db75dc591065d64653", diff --git a/modules/nf-core/bcftools/mpileup/environment.yml b/modules/nf-core/bcftools/mpileup/environment.yml new file mode 100644 index 000000000..346d187fe --- /dev/null +++ b/modules/nf-core/bcftools/mpileup/environment.yml @@ -0,0 +1,7 @@ +name: bcftools_mpileup +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::bcftools=1.17 diff --git a/modules/nf-core/bcftools/mpileup/main.nf b/modules/nf-core/bcftools/mpileup/main.nf new file mode 100644 index 000000000..83bec8ef5 --- /dev/null +++ b/modules/nf-core/bcftools/mpileup/main.nf @@ -0,0 +1,58 @@ +process BCFTOOLS_MPILEUP { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bcftools:1.17--haef29d1_0': + 'biocontainers/bcftools:1.17--haef29d1_0' }" + + input: + tuple val(meta), path(bam), path(intervals) + tuple val(meta2), path(fasta) + val save_mpileup + + output: + tuple val(meta), path("*vcf.gz") , emit: vcf + tuple val(meta), path("*vcf.gz.tbi") , emit: tbi + tuple val(meta), path("*stats.txt") , emit: stats + tuple val(meta), path("*.mpileup.gz"), emit: mpileup, optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + def args3 = task.ext.args3 ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def mpileup = save_mpileup ? "| tee ${prefix}.mpileup" : "" + def bgzip_mpileup = save_mpileup ? "bgzip ${prefix}.mpileup" : "" + def intervals = intervals ? 
"-T ${intervals}" : "" + """ + echo "${meta.id}" > sample_name.list + + bcftools \\ + mpileup \\ + --fasta-ref $fasta \\ + $args \\ + $bam \\ + $intervals \\ + $mpileup \\ + | bcftools call --output-type v $args2 \\ + | bcftools reheader --samples sample_name.list \\ + | bcftools view --output-file ${prefix}.vcf.gz --output-type z $args3 + + $bgzip_mpileup + + tabix -p vcf -f ${prefix}.vcf.gz + + bcftools stats ${prefix}.vcf.gz > ${prefix}.bcftools_stats.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bcftools: \$(bcftools --version 2>&1 | head -n1 | sed 's/^.*bcftools //; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/bcftools/mpileup/meta.yml b/modules/nf-core/bcftools/mpileup/meta.yml new file mode 100644 index 000000000..65410ddd6 --- /dev/null +++ b/modules/nf-core/bcftools/mpileup/meta.yml @@ -0,0 +1,70 @@ +name: bcftools_mpileup +description: Compresses VCF files +keywords: + - variant calling + - mpileup + - VCF +tools: + - mpileup: + description: | + Generates genotype likelihoods at each genomic position with coverage. + homepage: http://samtools.github.io/bcftools/bcftools.html + documentation: http://www.htslib.org/doc/bcftools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: Input BAM file + pattern: "*.{bam}" + - intervals: + type: file + description: Input intervals file. A file (commonly '.bed') containing regions to subset + - meta: + type: map + description: | + Groovy Map containing information about the genome fasta, e.g. [ id: 'sarscov2' ] + - fasta: + type: file + description: FASTA reference file + pattern: "*.{fasta,fa}" + - save_mpileup: + type: boolean + description: Save mpileup file generated by bcftools mpileup +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - vcf: + type: file + description: VCF gzipped output file + pattern: "*.{vcf.gz}" + - tbi: + type: file + description: tabix index file + pattern: "*.{vcf.gz.tbi}" + - stats: + type: file + description: Text output file containing stats + pattern: "*{stats.txt}" + - mpileup: + type: file + description: mpileup gzipped output for all positions + pattern: "{*.mpileup.gz}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@joseespinosa" + - "@drpatelh" +maintainers: + - "@joseespinosa" + - "@drpatelh" diff --git a/modules/nf-core/ngscheckmate/ncm/environment.yml b/modules/nf-core/ngscheckmate/ncm/environment.yml new file mode 100644 index 000000000..bf185fc23 --- /dev/null +++ b/modules/nf-core/ngscheckmate/ncm/environment.yml @@ -0,0 +1,7 @@ +name: ngscheckmate_ncm +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::ngscheckmate=1.0.1 diff --git a/modules/nf-core/ngscheckmate/ncm/main.nf b/modules/nf-core/ngscheckmate/ncm/main.nf new file mode 100644 index 000000000..99921ddcc --- /dev/null +++ b/modules/nf-core/ngscheckmate/ncm/main.nf @@ -0,0 +1,64 @@ +process NGSCHECKMATE_NCM { + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/ngscheckmate:1.0.1--py27pl5321r40hdfd78af_1': + 'biocontainers/ngscheckmate:1.0.1--py27pl5321r40hdfd78af_1' }" + + input: + tuple val(meta) , path(files) + tuple val(meta2), path(snp_bed) + tuple val(meta3), path(fasta) + + output: + tuple val(meta), path("*_corr_matrix.txt"), emit: corr_matrix + tuple val(meta), path("*_matched.txt") , emit: matched + tuple val(meta), path("*_all.txt") , emit: all + tuple val(meta), path("*.pdf") , emit: pdf, optional: true + tuple val(meta), path("*.vcf") , emit: vcf, optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "$meta.id" + def unzip = files.any { it.toString().endsWith(".vcf.gz") } + """ + if $unzip + then + for VCFGZ in *.vcf.gz; do + gunzip -cdf \$VCFGZ > \$( basename \$VCFGZ .gz ); + done + fi + + NCM_REF="./"${fasta} ncm.py -d . -bed ${snp_bed} -O . -N ${prefix} $args + + if $unzip + then + rm -f *.vcf # clean up decompressed vcfs + fi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + ngscheckmate: \$(ncm.py --help | sed "7!d;s/ *Ensuring Sample Identity v//g") + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "$meta.id" + """ + touch ${prefix}_output_corr_matrix.txt + touch ${prefix}_matched.txt + touch ${prefix}_all.txt + touch ${prefix}.pdf + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + ngscheckmate: \$(ncm.py --help | sed "7!d;s/ *Ensuring Sample Identity v//g") + END_VERSIONS + """ + +} diff --git a/modules/nf-core/ngscheckmate/ncm/meta.yml b/modules/nf-core/ngscheckmate/ncm/meta.yml new file mode 100644 index 000000000..0defad006 --- /dev/null +++ b/modules/nf-core/ngscheckmate/ncm/meta.yml @@ -0,0 +1,71 @@ +name: ngscheckmate_ncm +description: Determining whether sequencing data comes from the same individual by using SNP matching. Designed for humans on vcf or bam files. 
+keywords: + - ngscheckmate + - matching + - snp +tools: + - ngscheckmate: + description: NGSCheckMate is a software package for identifying next generation sequencing (NGS) data files from the same individual, including matching between DNA and RNA. + homepage: https://github.com/parklab/NGSCheckMate + documentation: https://github.com/parklab/NGSCheckMate + tool_dev_url: https://github.com/parklab/NGSCheckMate + doi: "10.1093/nar/gkx193" + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test'] + - files: + type: file + description: VCF or BAM files for each sample, in a merged channel (possibly gzipped). BAM files require an index too. + pattern: "*.{vcf,vcf.gz,bam,bai}" + - meta2: + type: map + description: | + Groovy Map containing SNP information + e.g. [ id:'test' ] + - snp_bed: + type: file + description: BED file containing the SNPs to analyse + pattern: "*.{bed}" + - meta3: + type: map + description: | + Groovy Map containing reference fasta index information + e.g. 
[ id:'test' ] + - fasta: + type: file + description: fasta file for the genome, only used in the bam mode + pattern: "*.{bed}" +output: + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - pdf: + type: file + description: A pdf containing a dendrogram showing how the samples match up + pattern: "*.{pdf}" + - corr_matrix: + type: file + description: A text file containing the correlation matrix between each sample + pattern: "*corr_matrix.txt" + - matched: + type: file + description: A txt file containing only the samples that match with each other + pattern: "*matched.txt" + - all: + type: file + description: A txt file containing all the sample comparisons, whether they match or not + pattern: "*all.txt" + - vcf: + type: file + description: If ran in bam mode, vcf files for each sample giving the SNP calls used + pattern: "*.vcf" +authors: + - "@sppearce" +maintainers: + - "@sppearce" diff --git a/nextflow.config b/nextflow.config index d15d4435c..719f3b255 100644 --- a/nextflow.config +++ b/nextflow.config @@ -92,6 +92,7 @@ params { skip_multiqc = false deseq2_vst = true rseqc_modules = 'bam_stat,inner_distance,infer_experiment,junction_annotation,junction_saturation,read_distribution,read_duplication' + ngscheckmate_bed = null // MultiQC options multiqc_config = null diff --git a/nextflow_schema.json b/nextflow_schema.json index 60e6585cf..c0ec0e9ab 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -618,6 +618,12 @@ "type": "boolean", "fa_icon": "fas fa-fast-forward", "description": "Skip all QC steps except for MultiQC." + }, + "ngscheckmate_bed": { + "type": "string", + "format": "path", + "fa_icon": "fas fa-bezier-curve", + "description": "Path to bed file containing a set of SNPs for NGSCheckMate." 
} } }, diff --git a/subworkflows/nf-core/bam_ngscheckmate/main.nf b/subworkflows/nf-core/bam_ngscheckmate/main.nf new file mode 100644 index 000000000..4dd106f32 --- /dev/null +++ b/subworkflows/nf-core/bam_ngscheckmate/main.nf @@ -0,0 +1,49 @@ +include { BCFTOOLS_MPILEUP } from '../../../modules/nf-core/bcftools/mpileup/main' +include { NGSCHECKMATE_NCM } from '../../../modules/nf-core/ngscheckmate/ncm/main' + +workflow BAM_NGSCHECKMATE { + + take: + ch_input // channel: [ val(meta1), bam/cram ] + ch_snp_bed // channel: [ val(meta2), bed ] + ch_fasta // channel: [ val(meta3), fasta ] + + main: + + ch_versions = Channel.empty() + + ch_input_bed = ch_input.combine(ch_snp_bed.collect()) + // do something to combine the metas? + .map{ input_meta, input_file, bed_meta, bed_file -> + [input_meta, input_file, bed_file] + } + + BCFTOOLS_MPILEUP (ch_input_bed, ch_fasta.collect(), false) + ch_versions = ch_versions.mix(BCFTOOLS_MPILEUP.out.versions) + + BCFTOOLS_MPILEUP + .out + .vcf + .map{meta, vcf -> vcf} // discard individual metas + .collect() // group into one channel + .map{files -> [files]} // make the channel into [vcf1, vcf2, ...] + .set {ch_collected_vcfs} + + ch_snp_bed + .map{meta, bed -> meta} // use the snp_bed file meta as the meta for the merged channel + .combine(ch_collected_vcfs) // add the vcf files after the meta, now looks like [meta, [vcf1, vcf2, ... 
] ] + .set {ch_vcfs} + + NGSCHECKMATE_NCM (ch_vcfs, ch_snp_bed, ch_fasta) + ch_versions = ch_versions.mix(NGSCHECKMATE_NCM.out.versions) + + emit: + corr_matrix = NGSCHECKMATE_NCM.out.corr_matrix // channel: [ meta, corr_matrix ] + matched = NGSCHECKMATE_NCM.out.matched // channel: [ meta, matched ] + all = NGSCHECKMATE_NCM.out.all // channel: [ meta, all ] + vcf = BCFTOOLS_MPILEUP.out.vcf // channel: [ meta, vcf ] + pdf = NGSCHECKMATE_NCM.out.pdf // channel: [ meta, pdf ] + versions = ch_versions // channel: [ versions.yml ] + +} + diff --git a/subworkflows/nf-core/bam_ngscheckmate/meta.yml b/subworkflows/nf-core/bam_ngscheckmate/meta.yml new file mode 100644 index 000000000..7de0a114d --- /dev/null +++ b/subworkflows/nf-core/bam_ngscheckmate/meta.yml @@ -0,0 +1,68 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json +name: "bam_ngscheckmate" +description: Take a set of bam files and run NGSCheckMate to determine whether samples match with each other, using a set of SNPs. +keywords: + - ngscheckmate + - qc + - bam + - snp +components: + - bcftools/mpileup + - ngscheckmate/ncm +input: + - meta1: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test' ] + - bam: + type: file + description: BAM files for each sample + pattern: "*.{bam}" + - meta2: + type: map + description: | + Groovy Map containing bed file information + e.g. [ id:'sarscov2' ] + - snp_bed: + type: file + description: BED file containing the SNPs to analyse. NGSCheckMate provides some default ones for hg19/hg38. + pattern: "*.{bed}" + - meta3: + type: map + description: | + Groovy Map containing reference genome meta information + e.g. 
[ id:'sarscov2' ] + - fasta: + type: file + description: fasta file for the genome + pattern: "*.{fasta}" +output: + - pdf: + type: file + description: A pdf containing a dendrogram showing how the samples match up + pattern: "*.{pdf}" + - corr_matrix: + type: file + description: A text file containing the correlation matrix between each sample + pattern: "*corr_matrix.txt" + - matched: + type: file + description: A txt file containing only the samples that match with each other + pattern: "*matched.txt" + - all: + type: file + description: A txt file containing all the sample comparisons, whether they match or not + pattern: "*all.txt" + - vcf: + type: file + description: vcf files for each sample giving the SNP calls + pattern: "*.vcf" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@SPPearce" +maintainers: + - "@SPPearce" diff --git a/subworkflows/nf-core/bam_ngscheckmate/nextflow.config b/subworkflows/nf-core/bam_ngscheckmate/nextflow.config new file mode 100644 index 000000000..cad9f57cc --- /dev/null +++ b/subworkflows/nf-core/bam_ngscheckmate/nextflow.config @@ -0,0 +1,13 @@ +// IMPORTANT: Add this configuration to your modules.config + +process { + withName: ".*BAM_NGSCHECKMATE:BCFTOOLS_MPILEUP" { + ext.args2 = '--no-version --ploidy 1 -c' + ext.args3 = '--no-version' + } + + withName: ".*BAM_NGSCHECKMATE:NGSCHECKMATE_NCM" { + ext.args = '-V' + } + +} diff --git a/workflows/rnaseq.nf b/workflows/rnaseq.nf index f8def2d10..2ff540fd9 100755 --- a/workflows/rnaseq.nf +++ b/workflows/rnaseq.nf @@ -40,15 +40,15 @@ if (!params.skip_alignment) { prepareToolIndices << params.aligner } if (!params.skip_pseudo_alignment && params.pseudo_aligner) { prepareToolIndices << params.pseudo_aligner } // Determine whether to filter the GTF or not -def filterGtf = +def filterGtf = (( // Condition 1: Alignment is required and aligner is set !params.skip_alignment && params.aligner - ) || + ) || ( // Condition 
2: Pseudoalignment is required and pseudoaligner is set !params.skip_pseudo_alignment && params.pseudo_aligner - ) || + ) || ( // Condition 3: Transcript FASTA file is not provided !params.transcript_fasta @@ -68,6 +68,8 @@ if (params.bam_csi_index) { } } +ch_ngscheckmate_bed = params.ngscheckmate_bed ? Channel.fromPath( params.ngscheckmate_bed, checkIfExists: true ) : Channel.empty() + // Stage dummy file to be used as an optional input where required ch_dummy_file = file("$projectDir/assets/dummy_file.txt", checkIfExists: true) @@ -144,6 +146,7 @@ include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoft // // SUBWORKFLOW: Consisting entirely of nf-core/modules // + include { FASTQ_SUBSAMPLE_FQ_SALMON } from '../subworkflows/nf-core/fastq_subsample_fq_salmon' include { FASTQ_FASTQC_UMITOOLS_TRIMGALORE } from '../subworkflows/nf-core/fastq_fastqc_umitools_trimgalore' include { FASTQ_FASTQC_UMITOOLS_FASTP } from '../subworkflows/nf-core/fastq_fastqc_umitools_fastp' @@ -155,6 +158,7 @@ include { BAM_DEDUP_STATS_SAMTOOLS_UMITOOLS as BAM_DEDUP_STATS_SAMTOOLS_UMITOOLS include { BAM_DEDUP_STATS_SAMTOOLS_UMITOOLS as BAM_DEDUP_STATS_SAMTOOLS_UMITOOLS_TRANSCRIPTOME } from '../subworkflows/nf-core/bam_dedup_stats_samtools_umitools' include { BEDGRAPH_BEDCLIP_BEDGRAPHTOBIGWIG as BEDGRAPH_BEDCLIP_BEDGRAPHTOBIGWIG_FORWARD } from '../subworkflows/nf-core/bedgraph_bedclip_bedgraphtobigwig' include { BEDGRAPH_BEDCLIP_BEDGRAPHTOBIGWIG as BEDGRAPH_BEDCLIP_BEDGRAPHTOBIGWIG_REVERSE } from '../subworkflows/nf-core/bedgraph_bedclip_bedgraphtobigwig' +include { BAM_NGSCHECKMATE } from '../subworkflows/nf-core/bam_ngscheckmate' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -768,6 +772,15 @@ workflow RNASEQ { ch_versions = ch_versions.mix(DUPRADAR.out.versions.first()) } + if (params.ngscheckmate_bed) { + BAM_NGSCHECKMATE ( + ch_genome_bam, + ch_ngscheckmate_bed.map{it -> [[id: "NGSCheckMate_bed"], it]}, + 
PREPARE_GENOME.out.fasta.map{it -> [[id: "genome_fasta"], it]} + ) + ch_versions = ch_versions.mix(BAM_NGSCHECKMATE.out.versions.first()) + } + if (!params.skip_rseqc && rseqc_modules.size() > 0) { BAM_RSEQC ( ch_genome_bam.join(ch_genome_bam_index, by: [0]), @@ -851,7 +864,7 @@ workflow RNASEQ { ch_versions = ch_versions.mix(DESEQ2_QC_PSEUDO.out.versions) } } - + // // MODULE: Pipeline reporting // @@ -922,7 +935,7 @@ workflow.onComplete { if (params.email || params.email_on_fail) { NfcoreTemplate.email(workflow, params, summary_params, projectDir, log, multiqc_report, pass_mapped_reads, pass_trimmed_reads, pass_strand_check) } - + NfcoreTemplate.dump_parameters(workflow, params) NfcoreTemplate.summary(workflow, params, log, pass_mapped_reads, pass_trimmed_reads, pass_strand_check)