diff --git a/.DS_Store b/.DS_Store
new file mode 100644
index 000000000..faab79dba
Binary files /dev/null and b/.DS_Store differ
diff --git a/.github/workflows/black_linter.yml b/.github/workflows/black_linter.yml
index e0d9b14b4..5b4e490ff 100644
--- a/.github/workflows/black_linter.yml
+++ b/.github/workflows/black_linter.yml
@@ -11,4 +11,4 @@ jobs:
       - uses: psf/black@stable
         with:
           options: "--check --verbose"
-          version: "22.3.0"
+          version: "23.7.0"
diff --git a/BALSAMIC/assets/scripts/modify_tnscope_infofield.py b/BALSAMIC/assets/scripts/modify_tnscope_infofield.py
new file mode 100644
index 000000000..c1f920e83
--- /dev/null
+++ b/BALSAMIC/assets/scripts/modify_tnscope_infofield.py
@@ -0,0 +1,109 @@
+#!/usr/bin/env python
+import vcfpy
+import click
+import sys
+import logging
+from typing import List, Optional
+
+LOG = logging.getLogger(__name__)
+
+
+def summarize_ad_to_dp(ad_list):
+    """
+    Summarizes the AD (allelic depth) field into total DP (read depth).
+
+    Parameters:
+        ad_list (list): List of read depths supporting each allele, [ref_depth, alt1_depth, alt2_depth, ...]
+
+    Returns:
+        int: Total read depth (DP) across all alleles.
+    """
+    if ad_list is None:
+        return 0  # Return 0 if AD field is not present
+    return sum(ad_list)
+
+
+@click.command()
+@click.argument("input_vcf", type=click.Path(exists=True))
+@click.argument("output_vcf", type=click.Path())
+def process_vcf(input_vcf: str, output_vcf: str):
+    """
+    Processes the input VCF file and writes the updated information to the output VCF file.
+
+    INPUT_VCF: Path to the input VCF file.
+    OUTPUT_VCF: Path to the output VCF file.
+    """
+
+    # Open the input VCF file
+    reader: vcfpy.Reader = vcfpy.Reader.from_path(input_vcf)
+
+    # Ensure the sample name is 'TUMOR'
+    sample_name: str = reader.header.samples.names[0]
+    if sample_name != "TUMOR":
+        LOG.error(
+            f"Error: The first sample is named '{sample_name}', but 'TUMOR' is expected."
+        )
+        sys.exit(1)
+
+    # Add AF and DP fields to the header if not already present
+    if "AF" not in reader.header.info_ids():
+        reader.header.add_info_line(
+            vcfpy.OrderedDict(
+                [
+                    ("ID", "AF"),
+                    ("Number", "A"),
+                    ("Type", "Float"),
+                    ("Description", "Allele Frequency"),
+                ]
+            )
+        )
+
+    if "DP" not in reader.header.info_ids():
+        reader.header.add_info_line(
+            vcfpy.OrderedDict(
+                [
+                    ("ID", "DP"),
+                    ("Number", "1"),
+                    ("Type", "Integer"),
+                    ("Description", "Total Depth"),
+                ]
+            )
+        )
+
+    # Open the output VCF file for writing
+    with vcfpy.Writer.from_path(output_vcf, reader.header) as writer:
+        # Loop through each record (variant)
+        for record in reader:
+            # Get the TUMOR sample data
+            sample_index: int = reader.header.samples.names.index(sample_name)
+            tumor_call: vcfpy.Call = record.calls[sample_index]
+
+            # Check and process AD field
+            tumor_ad: Optional[List[int]] = tumor_call.data.get(
+                "AD", None
+            )  # AD is a list [ref_count, alt_count]
+            if tumor_ad is None:
+                LOG.warning(
+                    f"Warning: AD field is missing for record at position {record.POS} on {record.CHROM}"
+                )
+            else:
+                record.INFO["DP"] = summarize_ad_to_dp(tumor_ad)
+
+            # Check and process AF field
+            tumor_af: Optional[float] = tumor_call.data.get("AF", None)
+            if tumor_af is None:
+                LOG.warning(
+                    f"Warning: AF field is missing for record at position {record.POS} on {record.CHROM}"
+                )
+                record.INFO["AF"] = [0.0]  # Default AF to 0.0 if missing
+            else:
+                record.INFO["AF"] = [tumor_af]  # Wrap AF in a list
+
+            # Write the updated record to the output VCF file
+            writer.write_record(record)
+
+    click.echo(f"VCF file processed and saved to {output_vcf}")
+
+
+if __name__ == "__main__":
+    process_vcf()
diff --git a/BALSAMIC/assets/scripts/sort_vcf.awk b/BALSAMIC/assets/scripts/sort_vcf.awk
new file mode 100644
index 000000000..be0fff335
--- /dev/null
+++ b/BALSAMIC/assets/scripts/sort_vcf.awk
@@ -0,0 +1,16 @@
+#!/usr/bin/awk -f
+
+BEGIN {
+    ENVIRON["LC_ALL"] = "en_US.UTF-8"
+}
+
+# If the line starts with a '#', it's a header, so print it as is
+$1 ~ /^#/ {
+    print $0;
+    next;
+}
+
+# Otherwise, send the body lines to an external sort command
+{
+    print $0 | "/usr/bin/sort -k1,1V -k2,2n"
+}
diff --git a/BALSAMIC/constants/cluster_analysis.json b/BALSAMIC/constants/cluster_analysis.json
index 236da2337..087d167cf 100644
--- a/BALSAMIC/constants/cluster_analysis.json
+++ b/BALSAMIC/constants/cluster_analysis.json
@@ -59,6 +59,10 @@
         "time": "06:00:00",
         "n": 8
     },
+    "picard_qc": {
+        "time": "06:00:00",
+        "n": 8
+    },
     "bwa_mem": {
         "time": "08:00:00",
         "n": 6
@@ -71,6 +75,14 @@
         "time": "4:00:00",
         "n": 18
     },
+    "bcftools_normalise_vcfs": {
+        "time": "2:00:00",
+        "n": 2
+    },
+    "bcftools_concatenate_vcfs": {
+        "time": "2:00:00",
+        "n": 2
+    },
     "cnvkit_segment_CNV_research": {
         "time": "6:00:00",
         "n": 18
@@ -116,6 +128,10 @@
         "time": "60:00:00",
         "n": 86
     },
+    "genmod_score_snvs":{
+        "time": "05:00:00",
+        "n": 8
+    },
     "manta_germline": {
         "time": "05:00:00",
         "n": 16
@@ -172,13 +188,25 @@
         "time": "24:00:00",
         "n": 12
     },
-    "sentieon_TNhaplotyper": {
-        "time": "24:00:00",
-        "n": 36
+    "vardict_merge": {
+        "time": "01:30:00",
+        "n": 5
     },
-    "sentieon_TNhaplotyper_tumor_only": {
-        "time": "24:00:00",
-        "n": 36
+    "vardict_sort": {
+        "time": "01:30:00",
+        "n": 1
+    },
+    "post_process_vardict": {
+        "time": "01:30:00",
+        "n": 2
+    },
+    "vardict_tumor_normal": {
+        "time": "12:00:00",
+        "n": 18
+    },
+    "vardict_tumor_only": {
+        "time": "10:00:00",
+        "n": 9
     },
     "sentieon_TNscope": {
         "time": "24:00:00",
@@ -212,18 +240,6 @@
         "time": "06:00:00",
         "n": 10
     },
-
"vardict_merge": { - "time": "01:30:00", - "n": 5 - }, - "vardict_tumor_normal": { - "time": "12:00:00", - "n": 18 - }, - "vardict_tumor_only": { - "time": "10:00:00", - "n": 9 - }, "sentieon_bwa_umiextract": { "time": "8:00:00", "n": 36 @@ -244,6 +260,14 @@ "time": "4:00:00", "n": 12 }, + "sentieon_tnscope_tga_t_only": { + "time": "4:00:00", + "n": 16 + }, + "sentieon_tnscope_tga_tumor_normal": { + "time": "5:00:00", + "n": 32 + }, "bcftools_get_somaticINDEL_research": { "time": "1:00:00", "n": 36 @@ -356,6 +380,14 @@ "time": "01:00:00", "n": 8 }, + "bcftools_split_tnscope_variants": { + "time": "01:00:00", + "n": 4 + }, + "modify_tnscope_infofield": { + "time": "01:00:00", + "n": 4 + }, "vcf2cytosure_convert": { "time": "02:00:00", "n": 8 @@ -388,14 +420,6 @@ "time": "03:00:00", "n": 8 }, - "bcftools_filter_vardict_research_tumor_only": { - "time": "04:00:00", - "n": 8 - }, - "bcftools_filter_vardict_clinical_tumor_only": { - "time": "03:00:00", - "n": 8 - }, "bcftools_filter_tnscope_umi_research_tumor_only": { "time": "04:00:00", "n": 8 @@ -404,14 +428,6 @@ "time": ":03:00:00", "n": 8 }, - "bcftools_filter_vardict_research_tumor_normal": { - "time": "04:00:00", - "n": 8 - }, - "bcftools_filter_vardict_clinical_tumor_normal": { - "time": "03:00:00", - "n": 8 - }, "bcftools_filter_TNscope_umi_research_tumor_normal": { "time": "04:00:00", "n": 8 diff --git a/BALSAMIC/constants/rules.py b/BALSAMIC/constants/rules.py index ff25ca37f..974bfb355 100644 --- a/BALSAMIC/constants/rules.py +++ b/BALSAMIC/constants/rules.py @@ -35,6 +35,7 @@ "snakemake_rules/quality_control/fastqc.rule", "snakemake_rules/quality_control/multiqc.rule", "snakemake_rules/quality_control/qc_metrics.rule", + "snakemake_rules/quality_control/picard_common.rule", "snakemake_rules/quality_control/sentieon_qc_metrics.rule", ], "report": [ @@ -45,7 +46,8 @@ "snakemake_rules/align/bam_compress.rule", ], "varcall": [ - "snakemake_rules/variant_calling/sentieon_quality_filter.rule", + "snakemake_rules/variant_calling/snv_quality_filter.rule", + "snakemake_rules/variant_calling/tnscope_post_process.rule", ], "annotate": [ "snakemake_rules/annotation/somatic_snv_annotation.rule", @@ -55,6 +57,7 @@ "snakemake_rules/annotation/varcaller_sv_filter.rule", "snakemake_rules/annotation/vcf2cytosure_convert.rule", "snakemake_rules/annotation/final_vcf_reheader.rule", + "snakemake_rules/annotation/rankscore.rule", ], }, "single_targeted": { @@ -78,16 +81,18 @@ "snakemake_rules/variant_calling/extend_bed.rule", "snakemake_rules/variant_calling/cnvkit_preprocess.rule", "snakemake_rules/variant_calling/germline_tga.rule", - "snakemake_rules/variant_calling/split_bed.rule", "snakemake_rules/variant_calling/somatic_cnv_tumor_only_tga.rule", - "snakemake_rules/variant_calling/somatic_tumor_only.rule", "snakemake_rules/variant_calling/somatic_sv_tumor_only_tga.rule", "snakemake_rules/umi/sentieon_varcall_tnscope.rule", + "snakemake_rules/umi/modify_tnscope_infofield_umi.rule", + "snakemake_rules/variant_calling/snv_t_varcall_tga.rule", "snakemake_rules/variant_calling/somatic_sv_postprocess_and_filter_tumor_only.rule", + "snakemake_rules/variant_calling/merge_snv_vcfs.rule", + "snakemake_rules/variant_calling/vardict_pre_and_postprocessing.rule", ], "annotate": [ - "snakemake_rules/annotation/rankscore.rule", "snakemake_rules/annotation/varcaller_filter_tumor_only.rule", + "snakemake_rules/annotation/varcaller_filter_tumor_only_umi.rule", ], }, "paired_targeted": { @@ -112,16 +117,18 @@ "snakemake_rules/variant_calling/extend_bed.rule", 
"snakemake_rules/variant_calling/cnvkit_preprocess.rule", "snakemake_rules/variant_calling/germline_tga.rule", - "snakemake_rules/variant_calling/split_bed.rule", - "snakemake_rules/variant_calling/somatic_tumor_normal.rule", "snakemake_rules/variant_calling/somatic_sv_tumor_normal_tga.rule", "snakemake_rules/variant_calling/somatic_cnv_tumor_normal_tga.rule", "snakemake_rules/umi/sentieon_varcall_tnscope_tn.rule", + "snakemake_rules/umi/modify_tnscope_infofield_umi.rule", + "snakemake_rules/variant_calling/snv_tn_varcall_tga.rule", "snakemake_rules/variant_calling/somatic_sv_postprocess_and_filter_tumor_normal.rule", + "snakemake_rules/variant_calling/merge_snv_vcfs.rule", + "snakemake_rules/variant_calling/vardict_pre_and_postprocessing.rule", ], "annotate": [ - "snakemake_rules/annotation/rankscore.rule", "snakemake_rules/annotation/varcaller_filter_tumor_normal.rule", + "snakemake_rules/annotation/varcaller_filter_tumor_normal_umi.rule", "snakemake_rules/annotation/vcfheader_rename.rule", "snakemake_rules/annotation/msi_tumor_normal.rule", ], @@ -138,14 +145,13 @@ ], "varcall": [ "snakemake_rules/variant_calling/germline_wgs.rule", - "snakemake_rules/variant_calling/sentieon_split_snv_sv.rule", - "snakemake_rules/variant_calling/sentieon_t_varcall.rule", + "snakemake_rules/variant_calling/sentieon_t_varcall_wgs.rule", "snakemake_rules/variant_calling/somatic_sv_tumor_only_wgs.rule", "snakemake_rules/dragen_suite/dragen_dna.rule", "snakemake_rules/variant_calling/somatic_sv_postprocess_and_filter_tumor_only.rule", ], "annotate": [ - "snakemake_rules/annotation/varcaller_wgs_filter_tumor_only.rule", + "snakemake_rules/annotation/varcaller_filter_tumor_only_wgs.rule", ], }, "paired_wgs": { @@ -161,13 +167,12 @@ ], "varcall": [ "snakemake_rules/variant_calling/germline_wgs.rule", - "snakemake_rules/variant_calling/sentieon_split_snv_sv.rule", - "snakemake_rules/variant_calling/sentieon_tn_varcall.rule", + "snakemake_rules/variant_calling/sentieon_tn_varcall_wgs.rule", "snakemake_rules/variant_calling/somatic_sv_tumor_normal_wgs.rule", "snakemake_rules/variant_calling/somatic_sv_postprocess_and_filter_tumor_normal.rule", ], "annotate": [ - "snakemake_rules/annotation/varcaller_wgs_filter_tumor_normal.rule", + "snakemake_rules/annotation/varcaller_filter_tumor_normal_wgs.rule", "snakemake_rules/annotation/vcfheader_rename.rule", "snakemake_rules/annotation/msi_tumor_normal.rule", ], @@ -184,8 +189,8 @@ "multiqc", "collect_custom_qc_metrics", # Alignment - "mergeBam_tumor_umiconsensus", - "mergeBam_normal_umiconsensus", + "bam_compress_tumor_umi", + "bam_compress_normal_umi", "bam_compress_tumor", "bam_compress_normal", # Germline @@ -193,22 +198,19 @@ "vep_annotate_germlineVAR_tumor", "vep_annotate_germlineVAR_normal", # SNVs - "bcftools_view_split_variant", - "bcftools_filter_tnscope_research_tumor_only", - "bcftools_filter_tnscope_research_tumor_normal", + "modify_tnscope_infofield", + "modify_tnscope_infofield_umi", + "gatk_update_vcf_sequence_dictionary", "bcftools_filter_tnscope_clinical_tumor_only", "bcftools_filter_tnscope_clinical_tumor_normal", - "vardict_merge", - "bcftools_filter_vardict_research_tumor_only", - "bcftools_filter_vardict_research_tumor_normal", - "bcftools_filter_vardict_clinical_tumor_only", - "bcftools_filter_vardict_clinical_tumor_normal", + "bcftools_filter_merged_clinical_tumor_only", + "bcftools_filter_merged_clinical_tumor_normal", "sentieon_tnscope_umi", "sentieon_tnscope_umi_tn", - "bcftools_filter_TNscope_umi_research_tumor_only", - 
"bcftools_filter_TNscope_umi_research_tumor_normal", "bcftools_filter_TNscope_umi_clinical_tumor_only", "bcftools_filter_TNscope_umi_clinical_tumor_normal", + "genmod_score_snvs", + "vep_annotate_somaticSNV_research", # SVs "svdb_merge_tumor_only", "svdb_merge_tumor_normal", diff --git a/BALSAMIC/constants/variant_filters.py b/BALSAMIC/constants/variant_filters.py index 8c3a22535..e1cb0b17b 100644 --- a/BALSAMIC/constants/variant_filters.py +++ b/BALSAMIC/constants/variant_filters.py @@ -1,54 +1,72 @@ -COMMON_SETTINGS = { - "pop_freq": { - "tag_value": 0.005, - "filter_name": "balsamic_high_pop_freq", +# Configuration of common SNV filter settings +SNV_BCFTOOLS_SETTINGS_COMMON = { + "swegen_snv_freq": { + "tag_value": 0.01, + "filter_name": "SWEGENAF", + "field": "INFO", + }, + "loqusdb_clinical_snv_freq": { + "tag_value": 0.01, + "filter_name": "Frq", "field": "INFO", }, + "high_normal_tumor_af_frac": { + "tag_value": 0.3, + "filter_name": "high_normal_tumor_af_frac", + "field": "FORMAT", + }, + "qss": { + "tag_value": 20, + "filter_name": "balsamic_low_quality_scores", + "field": "FORMAT", + }, + "strand_reads": { + "tag_value": 0, + "filter_name": "balsamic_low_strand_read_counts", + "field": "FORMAT", + }, "varcaller_name": "None", "filter_type": "general", "analysis_type": "tumor_only,tumor_normal", - "description": "General purpose filters used for filtering any variant caller", + "description": "General purpose filters used for filtering SNVs", } -# Configuration of common VARDICT settings -VARDICT_SETTINGS_COMMON = { +# Configuration of common TGA SNV filter settings +SNV_BCFTOOLS_SETTINGS_TGA = { + **SNV_BCFTOOLS_SETTINGS_COMMON, "pop_freq": { "tag_value": 0.005, "filter_name": "balsamic_high_pop_freq", "field": "INFO", }, - "swegen_snv_freq": { - "tag_value": 0.01, - "filter_name": "SWEGENAF", + "pop_freq_umi": { + "tag_value": 0.02, + "filter_name": "balsamic_umi_high_pop_freq", "field": "INFO", }, - "loqusdb_clinical_snv_freq": { - "tag_value": 0.01, - "filter_name": "Frq", + "sor": { + "tag_value": 2.7, + "filter_name": "balsamic_high_strand_oddsratio", "field": "INFO", }, - "MQ": {"tag_value": 30, "filter_name": "balsamic_low_mq", "field": "INFO"}, - "AF_min": {"tag_value": 0.007, "filter_name": "balsamic_low_af", "field": "INFO"}, + "AF_min": {"tag_value": 0.005, "filter_name": "balsamic_low_af", "field": "INFO"}, "AD": {"tag_value": 5, "filter_name": "balsamic_low_tumor_ad", "field": "INFO"}, - "varcaller_name": "VarDict", - "filter_type": "general", - "analysis_type": "tumor_only,tumor_normal", - "description": "General purpose filters used for filtering VarDict", + "MQ": {"tag_value": 30, "filter_name": "balsamic_low_mq", "field": "INFO"}, } -# Configuration of VARDICT settings for smaller panels -VARDICT_SETTINGS_PANEL = { - **VARDICT_SETTINGS_COMMON, +# Configuration of unique TGA SNV filter settings for smaller panels +SNV_BCFTOOLS_SETTINGS_PANEL = { + **SNV_BCFTOOLS_SETTINGS_TGA, "DP": { - "tag_value": 100, + "tag_value": 50, "filter_name": "balsamic_low_tumor_dp", "field": "INFO", }, } -# Configuration of VARDICT settings for exomes -VARDICT_SETTINGS_EXOME = { - **VARDICT_SETTINGS_COMMON, +# Configuration of unique TGA SNV filter settings for smaller exomes +SNV_BCFTOOLS_SETTINGS_EXOME = { + **SNV_BCFTOOLS_SETTINGS_TGA, "DP": { "tag_value": 20, "filter_name": "balsamic_low_tumor_dp", @@ -56,8 +74,14 @@ }, } -# Configuration for SENTIEON settings: -SENTIEON_VARCALL_SETTINGS = { +# Configuration of unique WGS SNV filter settings +SNV_BCFTOOLS_SETTINGS_WGS = { + 
**SNV_BCFTOOLS_SETTINGS_COMMON, + "sor": { + "tag_value": 3, + "filter_name": "balsamic_high_strand_oddsratio", + "field": "INFO", + }, "AD": {"tag_value": 3, "filter_name": "balsamic_low_tumor_ad", "field": "FORMAT"}, "DP": { "tag_value": 10, @@ -65,50 +89,11 @@ "field": "FORMAT", }, "AF_min": {"tag_value": 0.05, "filter_name": "balsamic_low_af", "field": "FORMAT"}, - "high_normal_tumor_af_frac": { - "tag_value": 0.3, - "filter_name": "high_normal_tumor_af_frac", - "field": "FORMAT", - }, "pop_freq": { "tag_value": 0.001, "filter_name": "balsamic_high_pop_freq", "field": "INFO", }, - "pop_freq_umi": { - "tag_value": 0.02, - "filter_name": "balsamic_umi_high_pop_freq", - "field": "INFO", - }, - "qss": { - "tag_value": 20, - "filter_name": "balsamic_low_quality_scores", - "field": "FORMAT", - }, - "strand_reads": { - "tag_value": 0, - "filter_name": "balsamic_low_strand_read_counts", - "field": "FORMAT", - }, - "sor": { - "tag_value": 3, - "filter_name": "balsamic_high_strand_oddsratio", - "field": "INFO", - }, - "swegen_snv_freq": { - "tag_value": 0.01, - "filter_name": "SWEGENAF", - "field": "INFO", - }, - "loqusdb_clinical_snv_freq": { - "tag_value": 0.01, - "filter_name": "Frq", - "field": "INFO", - }, - "varcaller_name": "sentieon", - "filter_type": "general", - "analysis_type": "tumor_only", - "description": "General purpose filters used for filtering tnscope and tnhaplotyper", } # Manta bcftools filters diff --git a/BALSAMIC/constants/workflow_params.py b/BALSAMIC/constants/workflow_params.py index a3095b640..943ee6f2c 100644 --- a/BALSAMIC/constants/workflow_params.py +++ b/BALSAMIC/constants/workflow_params.py @@ -19,7 +19,7 @@ "mutation": "somatic", "mutation_type": "SNV", "analysis_type": ["paired", "single"], - "sequencing_type": ["wgs"], + "sequencing_type": ["targeted", "wgs"], "workflow_solution": ["Sentieon"], }, "dnascope": { @@ -50,18 +50,18 @@ "sequencing_type": ["targeted"], "workflow_solution": ["BALSAMIC"], }, - "manta_germline": { - "mutation": "germline", - "mutation_type": "SV", + "merged": { + "mutation": "somatic", + "mutation_type": "SNV", "analysis_type": ["paired", "single"], - "sequencing_type": ["targeted", "wgs"], + "sequencing_type": ["targeted"], "workflow_solution": ["BALSAMIC"], }, - "haplotypecaller": { + "manta_germline": { "mutation": "germline", - "mutation_type": "SNV", + "mutation_type": "SV", "analysis_type": ["paired", "single"], - "sequencing_type": ["targeted"], + "sequencing_type": ["targeted", "wgs"], "workflow_solution": ["BALSAMIC"], }, "dellysv": { @@ -178,27 +178,26 @@ "min_base_qual": 10, "cov_threshold": [50, 100, 150, 200, 250], }, - "vardict": { - "allelic_frequency": "0.001", - "max_pval": "0.9", - "max_mm": "4.5", - "column_info": "-c 1 -S 2 -E 3 -g 4", - }, "vep": { "vep_filters": "--compress_output bgzip --vcf --everything --hgvsg --allow_non_variant --dont_skip --buffer_size 30000 --max_sv_size 249250621 --format vcf --offline --variant_class --merged --cache --verbose --force_overwrite" }, - "umicommon": { - "align_intbases": 1000000, - "filter_tumor_af": 0.0005, - }, + "umicommon": {"align_intbases": 1000000}, "umiconsensuscall": { "align_format": "BAM", "filter_minreads": "3,1,1", "tag": "XR", }, "umiextract": {"read_structure": "-d '3M2S+T,3M2S+T'"}, + "vardict": { + "allelic_frequency": "0.001", + "max_pval": "0.9", + "max_mm": "4.5", + "column_info": "-c 1 -S 2 -E 3 -g 4", + }, "tnscope_umi": { "algo": "TNscope", + "filter_tumor_af": 0.0005, + "pcr_model": "NONE", "min_tumorLOD": 4, "init_tumorLOD": 0.5, "error_rate": 5, @@ 
-206,4 +205,14 @@ "padding": 100, "disable_detect": "sv", }, + "tnscope_tga": { + "algo": "TNscope", + "filter_tumor_af": 0.0005, + "pcr_model": "NONE", + "min_tumorLOD": 4, + "init_tumorLOD": 0.5, + "error_rate": 5, + "prunefactor": 3, + "padding": 100, + }, } diff --git a/BALSAMIC/models/config.py b/BALSAMIC/models/config.py index d08962aa9..69b1c605f 100644 --- a/BALSAMIC/models/config.py +++ b/BALSAMIC/models/config.py @@ -93,12 +93,13 @@ class VarcallerAttribute(BaseModel): class VCFModel(BaseModel): """Contains VCF config""" - vardict: VarcallerAttribute tnscope: VarcallerAttribute dnascope: VarcallerAttribute tnscope_umi: VarcallerAttribute manta_germline: VarcallerAttribute + merged: VarcallerAttribute manta: VarcallerAttribute + vardict: VarcallerAttribute dellysv: VarcallerAttribute cnvkit: VarcallerAttribute ascat: VarcallerAttribute @@ -438,7 +439,7 @@ def get_final_bam_name( # Only dedup is necessary for panel of normals final_bam_suffix = "dedup.fixmate" elif self.analysis.sequencing_type == SequencingType.TARGETED: - # Only dedup is necessary for TGA + # TGA uses UMIs final_bam_suffix = "dedup.fixmate" else: # For WGS the bamfiles are realigned diff --git a/BALSAMIC/models/params.py b/BALSAMIC/models/params.py index 16a2f134f..9b985dab1 100644 --- a/BALSAMIC/models/params.py +++ b/BALSAMIC/models/params.py @@ -78,22 +78,6 @@ def parse_into_arguments(cls, cov_threshold): return " ".join(param_values) -class ParamsVardict(BaseModel): - """This class defines the params settings used as constants in vardict rule. - - Attributes: - allelic_frequency: float (required); minimum allelic frequency to detect - max_pval: float (required); the maximum p-value. Vardict default: 0.05 - max_mm: float (required); the maximum mean mismatches allowed. Vardict default: 5.25 - column_info: str (required); set of vardict filters for passing final variants - """ - - allelic_frequency: float - max_pval: float - max_mm: float - column_info: str - - class ParamsVEP(BaseModel): """This class defines the params settings used as constants in vep rule. @@ -128,13 +112,10 @@ class UMIParamsCommon(BaseModel): """This class defines the common params settings used as constants across various rules in UMI workflow. Attributes: - align_format: str (required); output alignment format. eg. 'BAM' align_intbases: int; input bases in each batch regardless of threads, for reproducibility - filter_tumor_af: float (required); settings to filter minimum allelic frequency """ align_intbases: int - filter_tumor_af: float class UMIParamsUMIextract(BaseModel): @@ -162,7 +143,7 @@ class UMIParamsConsensuscall(BaseModel): class UMIParamsTNscope(BaseModel): - """This class defines the params settings used as constants in UMI workflow- rule tnscope. + """This class defines the params settings used as constants in UMI workflow-rule tnscope. Attributes: algo: str; choice of sentieon varcall algorithm. eg. 'TNscope' @@ -173,15 +154,42 @@ class UMIParamsTNscope(BaseModel): error_rate: int (required); allow error-rate to consider in calling prunefactor: int (required); pruning factor in the kmer graph padding: int(required); amount to pad bed interval regions + pcr_model: str (required). PCR indel model used to weed out false positive indels. Eg: none- PCR free samples. 
""" algo: str + filter_tumor_af: float init_tumorLOD: float min_tumorLOD: int error_rate: int prunefactor: int padding: int disable_detect: str + pcr_model: str + + +class TGAParamsTNscope(BaseModel): + """This class defines the params settings used as constants in TGA workflow-rule tnscope. + + Attributes: + algo: str; choice of sentieon varcall algorithm. eg. 'TNscope' + filter_tumor_af: float (required); minimum allelic frequency to detect + min_tumorLOD: int (required); minimum tumor log odds in the final call of variants + init_tumorLOD: float (required); minimum tumor log odds in the initial pass calling variants + error_rate: int (required); allow error-rate to consider in calling + prunefactor: int (required); pruning factor in the kmer graph + padding: int(required); amount to pad bed interval regions + pcr_model: str (required). PCR indel model used to weed out false positive indels. Eg: none- PCR free samples. + """ + + algo: str + filter_tumor_af: float + init_tumorLOD: float + min_tumorLOD: int + error_rate: int + prunefactor: int + padding: int + pcr_model: str class BAMPostProcessingParams(BaseModel): @@ -204,6 +212,22 @@ class BEDPreProcessingParams(BaseModel): minimum_region_size: int +class ParamsVardict(BaseModel): + """This class defines the params settings used as constants in vardict rule. + + Attributes: + allelic_frequency: float (required); minimum allelic frequency to detect + max_pval: float (required); the maximum p-value. Vardict default: 0.05 + max_mm: float (required); the maximum mean mismatches allowed. Vardict default: 5.25 + column_info: str (required); set of vardict filters for passing final variants + """ + + allelic_frequency: float + max_pval: float + max_mm: float + column_info: str + + class BalsamicWorkflowConfig(BaseModel): """Defines set of rules in balsamic workflow @@ -216,7 +240,6 @@ class BalsamicWorkflowConfig(BaseModel): mosdepth: params used in mosdepth rule umicommon: global params defined across specific rules in UMI workflow vep: global params defined in the rule vep - vardict: params defined in the rule vardict umiextract : params defined in the rule sentieon_umiextract umiconsensuscall: params defined in the rule sentieon_consensuscall tnscope_umi: params defined in the rule sentieon_tnscope_umi @@ -232,12 +255,13 @@ class BalsamicWorkflowConfig(BaseModel): manta: ParamsManta mosdepth: ParamsMosdepth sentieon_wgs_metrics: ParamsSentieonWGSMetrics - vardict: ParamsVardict vep: ParamsVEP + vardict: ParamsVardict umicommon: UMIParamsCommon umiextract: UMIParamsUMIextract umiconsensuscall: UMIParamsConsensuscall tnscope_umi: UMIParamsTNscope + tnscope_tga: TGAParamsTNscope def get_manta_settings(self, sequencing_type) -> str: """Return correct setting for manta rules depending on sequencing type.""" diff --git a/BALSAMIC/snakemake_rules/align/tga_sentieon_alignment.rule b/BALSAMIC/snakemake_rules/align/tga_sentieon_alignment.rule index e464839d3..2fc85d04e 100644 --- a/BALSAMIC/snakemake_rules/align/tga_sentieon_alignment.rule +++ b/BALSAMIC/snakemake_rules/align/tga_sentieon_alignment.rule @@ -41,3 +41,4 @@ export SENTIEON_LICENSE={params.sentieon_lic}; rm -rf {params.tmpdir} """ + diff --git a/BALSAMIC/snakemake_rules/align/wgs_sentieon_alignment.rule b/BALSAMIC/snakemake_rules/align/wgs_sentieon_alignment.rule index 8e489cf1d..4a49d209b 100644 --- a/BALSAMIC/snakemake_rules/align/wgs_sentieon_alignment.rule +++ b/BALSAMIC/snakemake_rules/align/wgs_sentieon_alignment.rule @@ -15,7 +15,7 @@ rule sentieon_align_sort: sentieon_exec = 
config_model.sentieon.sentieon_exec, sentieon_lic = config_model.sentieon.sentieon_license, sample_id = "{sample}", - sample_type = "{sample_type}", + sample_type = lambda wildcards: config_model.get_sample_type_by_name(wildcards.sample, uppercase=True), fastq_pattern = "{fastq_pattern}" threads: get_threads(cluster_config, 'sentieon_align_sort') diff --git a/BALSAMIC/snakemake_rules/annotation/rankscore.rule b/BALSAMIC/snakemake_rules/annotation/rankscore.rule index 73b8e2b41..31850ffd1 100644 --- a/BALSAMIC/snakemake_rules/annotation/rankscore.rule +++ b/BALSAMIC/snakemake_rules/annotation/rankscore.rule @@ -3,27 +3,28 @@ # Rank variants according to a rankscore model -rule genmod_score_vardict: +rule genmod_score_snvs: input: - vcf = vep_dir + "{var_type}.somatic.{case_name}.vardict.research.filtered.pass.vcf.gz", + vcf = vep_dir + "{var_type}.somatic.{case_name}.{var_caller}.clinical.vcf.gz", rank_score = config["reference"]["rank_score"] output: - vcf_pass = vep_dir + "{var_type}.somatic.{case_name}.vardict.research.filtered.pass.ranked.vcf.gz", + vcf_clinical = vep_dir + "{var_type}.somatic.{case_name}.{var_caller}.clinical.scored.vcf.gz", benchmark: - Path(benchmark_dir, 'genmod_score_vardict_' + "{var_type}.somatic.{case_name}.tsv").as_posix() + Path(benchmark_dir, 'genmod_score_snvs_' + "{var_type}.{var_caller}.somatic.{case_name}.tsv").as_posix() singularity: Path(singularity_image, config["bioinfo_tools"].get("genmod") + ".sif").as_posix() params: - case_name = "{case_name}" + case_name = "{case_name}", + housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "scored"} threads: - get_threads(cluster_config, 'genmod_score_vardict') + get_threads(cluster_config, 'genmod_score_snvs') message: - ("Scoring annotated vardict variants using genmod for {params.case_name}") + ("Scoring annotated SNV variants using genmod for {params.case_name}") shell: """ -genmod score -r -c {input.rank_score} {input.vcf} | \ +genmod score -i {params.case_name} -r -c {input.rank_score} {input.vcf} | \ +bcftools view -o {output.vcf_clinical} -O z; -bcftools view -o {output.vcf_pass} -O z; - -tabix -p vcf -f {output.vcf_pass}; +tabix -p vcf -f {output.vcf_clinical}; """ + diff --git a/BALSAMIC/snakemake_rules/annotation/somatic_snv_annotation.rule b/BALSAMIC/snakemake_rules/annotation/somatic_snv_annotation.rule index 778f64536..bb952e4d7 100644 --- a/BALSAMIC/snakemake_rules/annotation/somatic_snv_annotation.rule +++ b/BALSAMIC/snakemake_rules/annotation/somatic_snv_annotation.rule @@ -83,16 +83,16 @@ tabix -p vcf -f {output.vcf_indel_research} rule vep_annotate_somaticSNV_research: input: vcf_snv_research = vep_dir + "SNV.somatic.{case_name}.{var_caller}.cadd_indel.research.vcf.gz", - header = vcf_dir + "SNV.somatic.{case_name}.{var_caller}.sample_name_map", cosmic = config["reference"]["cosmic"] output: - vcf_snv_research = temp(vep_dir + "SNV.somatic.{case_name}.{var_caller}.research.vcf.gz"), + vcf_snv_unfiltered = vep_dir + "SNV.somatic.{case_name}.{var_caller}.research.vcf.gz", vcfanno_research_toml = vep_dir + "SNV.somatic.{case_name}.{var_caller}_vcfanno_research.toml" benchmark: Path(benchmark_dir, "vep_somatic_research_snv.{case_name}.{var_caller}.tsv").as_posix() singularity: Path(singularity_image, config["bioinfo_tools"].get("ensembl-vep") + ".sif").as_posix() params: + housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "research"}, message_text = "SNV.somatic.{case_name}.{var_caller}.research.vcf.gz", tmp_vcf_research = temp(vep_dir + 
"SNV.somatic.{case_name}.{var_caller}.tmp.research.vcf.gz"), vcfanno_research_annotations = dump_toml(research_annotations), @@ -110,7 +110,6 @@ export PERL5LIB=; echo \'{params.vcfanno_research_annotations}\' > {output.vcfanno_research_toml}; vcfanno -p {threads} {output.vcfanno_research_toml} {input.vcf_snv_research} \ -| bcftools reheader --threads {threads} -s {input.header} \ | bcftools view --threads {threads} -O z -o {params.tmp_vcf_research} ; vep \ @@ -118,12 +117,12 @@ vep \ --dir_cache {params.vep_cache} \ --dir_plugins $vep_path \ --input_file {params.tmp_vcf_research} \ ---output_file {output.vcf_snv_research} \ +--output_file {output.vcf_snv_unfiltered} \ --fork {threads} \ {params.vep_defaults} \ --custom {input.cosmic},COSMIC,vcf,exact,0,CDS,GENE,STRAND,CNT,AA ; -tabix -p vcf -f {output.vcf_snv_research}; +tabix -p vcf -f {output.vcf_snv_unfiltered}; rm {params.tmp_vcf_research}; """ @@ -131,9 +130,8 @@ rm {params.tmp_vcf_research}; rule vcfanno_annotate_somaticSNV_clinical: input: vcf_snv_research = vep_dir + "SNV.somatic.{case_name}.{var_caller}.research.vcf.gz", - header = vcf_dir + "SNV.somatic.{case_name}.{var_caller}.sample_name_map", output: - vcf_snv_clinical = temp(vep_dir + "SNV.somatic.{case_name}.{var_caller}.clinical.vcf.gz"), + vcf_snv_clinical = vep_dir + "SNV.somatic.{case_name}.{var_caller}.clinical.vcf.gz", benchmark: Path(benchmark_dir, "vcfanno_somatic_clinical_snv.{case_name}.{var_caller}.tsv").as_posix() singularity: @@ -154,7 +152,6 @@ rule vcfanno_annotate_somaticSNV_clinical: if [[ -f "{params.clinical_snv}" || -f "{params.cancer_germline_snv}" || -f "{params.cancer_somatic_snv}" ]]; then echo \'{params.vcfanno_clinical_annotations}\' > {params.vcfanno_clinical_toml}; vcfanno -p {threads} {params.vcfanno_clinical_toml} {input.vcf_snv_research} | \ - bcftools reheader --threads {threads} -s {input.header} | \ bcftools view --threads {threads} -O z -o {output.vcf_snv_clinical}; else cp {input.vcf_snv_research} {output.vcf_snv_clinical}; diff --git a/BALSAMIC/snakemake_rules/annotation/varcaller_filter_tumor_normal.rule b/BALSAMIC/snakemake_rules/annotation/varcaller_filter_tumor_normal.rule index 2172a5aa9..4db2b261f 100644 --- a/BALSAMIC/snakemake_rules/annotation/varcaller_filter_tumor_normal.rule +++ b/BALSAMIC/snakemake_rules/annotation/varcaller_filter_tumor_normal.rule @@ -3,224 +3,68 @@ # NGS filters for various scenarios -rule bcftools_filter_vardict_research_tumor_normal: +rule bcftools_filter_merged_research_tumor_normal: input: - vcf_snv_research = vep_dir + "{var_type}.somatic.{case_name}.vardict.research.vcf.gz", + vcf_snv_research = vep_dir + "{var_type}.somatic.{case_name}.merged.research.vcf.gz", output: - vcf_pass_vardict = vep_dir + "{var_type}.somatic.{case_name}.vardict.research.filtered.pass.vcf.gz", - bcftools_counts_research = vep_dir + "{var_type}.somatic.{case_name}.vardict.research.filtered.pass.stats" + vcf_pass_merged = vep_dir + "{var_type}.somatic.{case_name}.merged.research.filtered.pass.vcf.gz", + bcftools_counts_research = vep_dir + "{var_type}.somatic.{case_name}.merged.research.filtered.pass.stats" benchmark: - Path(benchmark_dir, 'bcftools_filter_vardict_research_tumor_normal_' + "{var_type}.somatic.{case_name}.tsv").as_posix() + Path(benchmark_dir, 'bcftools_filter_merged_research_tumor_normal_' + "{var_type}.somatic.{case_name}.tsv").as_posix() singularity: Path(singularity_image, config["bioinfo_tools"].get("bcftools") + ".sif").as_posix() params: - housekeeper_id = {"id": config["analysis"]["case_id"], "tags": 
"research"}, - pop_freq = [VARDICT.pop_freq.tag_value, VARDICT.pop_freq.filter_name], - swegen_freq = [VARDICT.swegen_snv_freq.tag_value, VARDICT.swegen_snv_freq.filter_name], - case_name = '{case_name}', - edit_vcf_script = get_script_path("edit_vcf_info.py"), - variant_caller = "vardict" - threads: - get_threads(cluster_config, 'bcftools_filter_vardict_research_tumor_normal') - message: - "Filtering vardict tumor-normal annotated research variants using bcftools and " - "adding FOUND_IN tags to the output VCF for {params.case_name} " - shell: - """ -bcftools view {input.vcf_snv_research} | \ -bcftools filter --threads {threads} --include 'INFO/GNOMADAF_popmax <= {params.pop_freq[0]} || INFO/GNOMADAF_popmax == \".\"' --soft-filter '{params.pop_freq[1]}' --mode '+' | \ -bcftools filter --threads {threads} --include 'INFO/SWEGENAF <= {params.swegen_freq[0]} || INFO/SWEGENAF == \".\"' --soft-filter '{params.swegen_freq[1]}' --mode '+' |\ -bcftools view -f PASS -o {output.vcf_pass_vardict}.temp1 -O z; - -python {params.edit_vcf_script} \ ---input_vcf {output.vcf_pass_vardict}.temp1 \ ---output_vcf {output.vcf_pass_vardict}.temp2 \ ---variant_caller {params.variant_caller}; - -bgzip -@ {threads} -l 9 -c {output.vcf_pass_vardict}.temp2 > {output.vcf_pass_vardict} - -tabix -p vcf -f {output.vcf_pass_vardict}; - -bcftools +counts {output.vcf_pass_vardict} > {output.bcftools_counts_research}; - -rm {output.vcf_pass_vardict}.temp1; - -rm {output.vcf_pass_vardict}.temp2; - """ - - -rule bcftools_filter_tnhaplotyper_tumor_normal: - input: - vcf = vep_dir + "{var_type}.somatic.{case_name}.tnhaplotyper.research.vcf.gz", - output: - vcf_filtered = vep_dir + "{var_type}.somatic.{case_name}.tnhaplotyper.research.filtered.vcf.gz", - vcf_pass_tnhaplotyper = vep_dir + "{var_type}.somatic.{case_name}.tnhaplotyper.research.filtered.pass.vcf.gz", - benchmark: - Path(benchmark_dir, 'bcftools_filter_tnhaplotyper_tumor_normal' + "{var_type}.somatic.{case_name}.tsv").as_posix() - singularity: - Path(singularity_image, config["bioinfo_tools"].get("bcftools") + ".sif").as_posix() - params: - pop_freq = [COMMON_FILTERS.pop_freq.tag_value, COMMON_FILTERS.pop_freq.filter_name], - case_name = '{case_name}' - threads: - get_threads(cluster_config, 'bcftools_filter_tnhaplotyper_tumor_normal') - message: - "Filtering tnhaplotyper tumor-normal annotated variants using bcftools for {params.case_name}" - shell: - """ -bcftools view {input.vcf} \ -| bcftools filter --threads {threads} --include 'INFO/GNOMADAF_popmax <= {params.pop_freq[0]} || INFO/GNOMADAF_popmax == \".\"' --soft-filter '{params.pop_freq[1]}' --mode '+' \ -| bcftools view --threads {threads} -o {output.vcf_filtered} -O z; - -tabix -p vcf -f {output.vcf_filtered}; - -bcftools view -f PASS -o {output.vcf_pass_tnhaplotyper} -O z {output.vcf_filtered}; - -tabix -p vcf -f {output.vcf_pass_tnhaplotyper}; - - """ - - -rule bcftools_filter_TNscope_umi_research_tumor_normal: - input: - vcf_snv_research = vep_dir + "{var_type}.somatic.{case_name}.tnscope_umi.research.vcf.gz", - output: - vcf_pass_tnscope_umi = vep_dir + "{var_type}.somatic.{case_name}.tnscope_umi.research.filtered.pass.vcf.gz", - bcftools_counts_research = vep_dir + "{var_type}.somatic.{case_name}.tnscope_umi.research.filtered.pass.stats" - benchmark: - Path(benchmark_dir, 'bcftools_filter_TNscope_umi_research_tumor_normal' + "{var_type}.somatic.{case_name}.tsv").as_posix() - singularity: - Path(singularity_image, config["bioinfo_tools"].get("bcftools") + ".sif").as_posix() - params: - pop_freq = 
[SENTIEON_CALLER.pop_freq_umi.tag_value, SENTIEON_CALLER.pop_freq_umi.filter_name], - swegen_freq = [SENTIEON_CALLER.swegen_snv_freq.tag_value, SENTIEON_CALLER.swegen_snv_freq.filter_name], - housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "research"}, - case_name = '{case_name}', - edit_vcf_script = get_script_path("edit_vcf_info.py"), - variant_caller = "tnscope_umi" - threads: - get_threads(cluster_config, 'bcftools_filter_TNscope_umi_research_tumor_normal') - message: - "Filtering TNscope_umi tumor-normal annotated research variants using bcftools and " - "adding FOUND_IN tags to the output VCF file for {params.case_name} " - shell: - """ -bcftools view --threads {threads} -f PASS,triallelic_site {input.vcf_snv_research} | \ -bcftools filter --threads {threads} --include 'INFO/GNOMADAF_popmax <= {params.pop_freq[0]} || INFO/GNOMADAF_popmax == \".\"' --soft-filter '{params.pop_freq[1]}' --mode '+' | \ -bcftools filter --threads {threads} --include 'INFO/SWEGENAF <= {params.swegen_freq[0]} || INFO/SWEGENAF == \".\"' --soft-filter '{params.swegen_freq[1]}' --mode '+' |\ -bcftools view --threads {threads} -i 'FILTER == "PASS" || FILTER == "triallelic_site"' -o {output.vcf_pass_tnscope_umi}.temp1 -O z; - -python {params.edit_vcf_script} \ ---input_vcf {output.vcf_pass_tnscope_umi}.temp1 \ ---output_vcf {output.vcf_pass_tnscope_umi}.temp2 \ ---variant_caller {params.variant_caller}; - -bgzip -@ {threads} -l 9 -c {output.vcf_pass_tnscope_umi}.temp2 > {output.vcf_pass_tnscope_umi} - -tabix -p vcf -f {output.vcf_pass_tnscope_umi}; - -bcftools +counts {output.vcf_pass_tnscope_umi} > {output.bcftools_counts_research}; - -rm {output.vcf_pass_tnscope_umi}.temp1; - -rm {output.vcf_pass_tnscope_umi}.temp2; - """ - - -rule bcftools_filter_vardict_clinical_tumor_normal: - input: - vcf_snv_clinical = vep_dir + "{var_type}.somatic.{case_name}.vardict.clinical.vcf.gz", - namemap = vep_dir + "status_to_sample_id_namemap" - output: - vcf_pass_vardict = vep_dir + "{var_type}.somatic.{case_name}.vardict.clinical.filtered.pass.vcf.gz", - bcftools_counts_clinical = vep_dir + "{var_type}.somatic.{case_name}.vardict.clinical.filtered.pass.stats" - benchmark: - Path(benchmark_dir,'bcftools_filter_vardict_clinical_tumor_normal_' + "{var_type}.somatic.{case_name}.tsv").as_posix() - singularity: - Path(singularity_image,config["bioinfo_tools"].get("bcftools") + ".sif").as_posix() - params: - housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "clinical"}, - pop_freq = [VARDICT.pop_freq.tag_value, VARDICT.pop_freq.filter_name], - swegen_freq = [VARDICT.swegen_snv_freq.tag_value, VARDICT.swegen_snv_freq.filter_name], - loqusdb_clinical_freq = [VARDICT.loqusdb_clinical_snv_freq.tag_value, VARDICT.loqusdb_clinical_snv_freq.filter_name], - case_name = '{case_name}', - edit_vcf_script = get_script_path("edit_vcf_info.py"), - variant_caller = "vardict" + pop_freq = [SNV_FILTER_SETTINGS.pop_freq.tag_value, SNV_FILTER_SETTINGS.pop_freq.filter_name], + swegen_freq = [SNV_FILTER_SETTINGS.swegen_snv_freq.tag_value, SNV_FILTER_SETTINGS.swegen_snv_freq.filter_name], + case_name = '{case_name}' threads: - get_threads(cluster_config,'bcftools_filter_vardict_clinical_tumor_normal') + get_threads(cluster_config, 'bcftools_filter_tnscope_research_tumor_normal') message: - "Filtering vardict tumor-normal annotated clinical variants using bcftools and " - "adding FOUND_IN tags to the output VCF for {params.case_name} " + "Filtering TGA tumor-normal merged annotated research variants using bcftools for 
{params.case_name}" shell: """ -bcftools reheader --threads {threads} -s {input.namemap} {input.vcf_snv_clinical} |\ -bcftools filter --threads {threads} --include 'INFO/GNOMADAF_popmax <= {params.pop_freq[0]} || INFO/GNOMADAF_popmax == \".\"' --soft-filter '{params.pop_freq[1]}' --mode '+' | \ +bcftools view -f PASS,triallelic_site --threads {threads} {input.vcf_snv_research} |\ +bcftools filter --threads {threads} --include 'INFO/GNOMADAF_popmax <= {params.pop_freq[0]} || INFO/GNOMADAF_popmax == \".\"' --soft-filter '{params.pop_freq[1]}' --mode '+' |\ bcftools filter --threads {threads} --include 'INFO/SWEGENAF <= {params.swegen_freq[0]} || INFO/SWEGENAF == \".\"' --soft-filter '{params.swegen_freq[1]}' --mode '+' |\ -bcftools filter --threads {threads} --include 'INFO/Frq <= {params.loqusdb_clinical_freq[0]} || INFO/Frq == \".\"' --soft-filter '{params.loqusdb_clinical_freq[1]}' --mode '+' |\ -bcftools view --threads {threads} -f PASS -O z -o {output.vcf_pass_vardict}.temp1; - -python {params.edit_vcf_script} \ ---input_vcf {output.vcf_pass_vardict}.temp1 \ ---output_vcf {output.vcf_pass_vardict}.temp2 \ ---variant_caller {params.variant_caller}; - -bgzip -@ {threads} -l 9 -c {output.vcf_pass_vardict}.temp2 > {output.vcf_pass_vardict} - -tabix -p vcf -f {output.vcf_pass_vardict}; - -bcftools +counts {output.vcf_pass_vardict} > {output.bcftools_counts_clinical}; +bcftools view --threads {threads} -i 'FILTER == "PASS" || FILTER == "triallelic_site"' -o {output.vcf_pass_merged} -O z; -rm {output.vcf_pass_vardict}.temp1; +tabix -p vcf -f {output.vcf_pass_merged}; -rm {output.vcf_pass_vardict}.temp2; +bcftools +counts {output.vcf_pass_merged} > {output.bcftools_counts_research}; """ -rule bcftools_filter_TNscope_umi_clinical_tumor_normal: +rule bcftools_filter_merged_clinical_tumor_normal: input: - vcf_snv_clinical = vep_dir + "{var_type}.somatic.{case_name}.tnscope_umi.clinical.vcf.gz", + vcf_snv_clinical = vep_dir + "{var_type}.somatic.{case_name}.merged.clinical.vcf.gz", namemap = vep_dir + "status_to_sample_id_namemap" output: - vcf_pass_tnscope_umi = vep_dir + "{var_type}.somatic.{case_name}.tnscope_umi.clinical.filtered.pass.vcf.gz", - bcftools_counts_clinical = vep_dir + "{var_type}.somatic.{case_name}.tnscope_umi.clinical.filtered.pass.stats" + vcf_pass_merged = vep_dir + "{var_type}.somatic.{case_name}.merged.clinical.filtered.pass.vcf.gz", + bcftools_counts_clinical = vep_dir + "{var_type}.somatic.{case_name}.merged.clinical.filtered.pass.stats" benchmark: - Path(benchmark_dir, 'bcftools_filter_TNscope_umi_clinical_tumor_normal' + "{var_type}.somatic.{case_name}.tsv").as_posix() + Path(benchmark_dir, 'bcftools_filter_merged_clinical_tumor_normal_' + "{var_type}.somatic.{case_name}.tsv").as_posix() singularity: Path(singularity_image, config["bioinfo_tools"].get("bcftools") + ".sif").as_posix() params: - pop_freq = [SENTIEON_CALLER.pop_freq_umi.tag_value, SENTIEON_CALLER.pop_freq_umi.filter_name], - swegen_freq = [SENTIEON_CALLER.swegen_snv_freq.tag_value, SENTIEON_CALLER.swegen_snv_freq.filter_name], - loqusdb_clinical_freq = [SENTIEON_CALLER.loqusdb_clinical_snv_freq.tag_value, SENTIEON_CALLER.loqusdb_clinical_snv_freq.filter_name], + pop_freq = [SNV_FILTER_SETTINGS.pop_freq.tag_value, SNV_FILTER_SETTINGS.pop_freq.filter_name], + swegen_freq = [SNV_FILTER_SETTINGS.swegen_snv_freq.tag_value, SNV_FILTER_SETTINGS.swegen_snv_freq.filter_name], + loqusdb_clinical_freq = [SNV_FILTER_SETTINGS.loqusdb_clinical_snv_freq.tag_value, 
SNV_FILTER_SETTINGS.loqusdb_clinical_snv_freq.filter_name], housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "clinical"}, case_name = '{case_name}', - edit_vcf_script = get_script_path("edit_vcf_info.py"), - variant_caller = "tnscope_umi" threads: - get_threads(cluster_config, 'bcftools_filter_TNscope_umi_clinical_tumor_normal') + get_threads(cluster_config, 'bcftools_filter_tnscope_clinical_tumor_normal') message: - "Filtering TNscope_umi tumor-normal annotated clinical variants using bcftools and " - "adding FOUND_IN tags to the output VCF file for {params.case_name} " + "Filtering TGA tumor-normal merged annotated clinical variants using bcftools for {params.case_name}" shell: """ -bcftools reheader --threads {threads} -s {input.namemap} {input.vcf_snv_clinical} |\ -bcftools view --threads {threads} -f PASS,triallelic_site | \ +bcftools reheader --threads {threads} -s {input.namemap} {input.vcf_snv_clinical} | \ +bcftools view -f PASS,triallelic_site --threads {threads} | \ bcftools filter --threads {threads} --include 'INFO/GNOMADAF_popmax <= {params.pop_freq[0]} || INFO/GNOMADAF_popmax == \".\"' --soft-filter '{params.pop_freq[1]}' --mode '+' | \ bcftools filter --threads {threads} --include 'INFO/SWEGENAF <= {params.swegen_freq[0]} || INFO/SWEGENAF == \".\"' --soft-filter '{params.swegen_freq[1]}' --mode '+' |\ bcftools filter --threads {threads} --include 'INFO/Frq <= {params.loqusdb_clinical_freq[0]} || INFO/Frq == \".\"' --soft-filter '{params.loqusdb_clinical_freq[1]}' --mode '+' |\ -bcftools view --threads {threads} -i 'FILTER == "PASS" || FILTER == "triallelic_site"' -O z -o {output.vcf_pass_tnscope_umi}.temp1; - -python {params.edit_vcf_script} \ ---input_vcf {output.vcf_pass_tnscope_umi}.temp1 \ ---output_vcf {output.vcf_pass_tnscope_umi}.temp2 \ ---variant_caller {params.variant_caller}; - -bgzip -@ {threads} -l 9 -c {output.vcf_pass_tnscope_umi}.temp2 > {output.vcf_pass_tnscope_umi} - -tabix -p vcf -f {output.vcf_pass_tnscope_umi}; - -bcftools +counts {output.vcf_pass_tnscope_umi} > {output.bcftools_counts_clinical}; +bcftools view --threads {threads} -i 'FILTER == "PASS" || FILTER == "triallelic_site"' -O z -o {output.vcf_pass_merged}; -rm {output.vcf_pass_tnscope_umi}.temp1; +tabix -p vcf -f {output.vcf_pass_merged}; -rm {output.vcf_pass_tnscope_umi}.temp2; +bcftools +counts {output.vcf_pass_merged} > {output.bcftools_counts_clinical}; """ diff --git a/BALSAMIC/snakemake_rules/annotation/varcaller_filter_tumor_normal_umi.rule b/BALSAMIC/snakemake_rules/annotation/varcaller_filter_tumor_normal_umi.rule new file mode 100644 index 000000000..7366be829 --- /dev/null +++ b/BALSAMIC/snakemake_rules/annotation/varcaller_filter_tumor_normal_umi.rule @@ -0,0 +1,98 @@ +# vim: syntax=python tabstop=4 expandtab +# coding: utf-8 +# NGS filters for various scenarios + + +rule bcftools_filter_TNscope_umi_research_tumor_normal: + input: + vcf_snv_research = vep_dir + "{var_type}.somatic.{case_name}.tnscope_umi.research.vcf.gz", + output: + vcf_pass_tnscope_umi = vep_dir + "{var_type}.somatic.{case_name}.tnscope_umi.research.filtered.pass.vcf.gz", + bcftools_counts_research = vep_dir + "{var_type}.somatic.{case_name}.tnscope_umi.research.filtered.pass.stats" + benchmark: + Path(benchmark_dir, 'bcftools_filter_TNscope_umi_research_tumor_normal' + "{var_type}.somatic.{case_name}.tsv").as_posix() + singularity: + Path(singularity_image, config["bioinfo_tools"].get("bcftools") + ".sif").as_posix() + params: + pop_freq = [SNV_FILTER_SETTINGS.pop_freq_umi.tag_value, 
SNV_FILTER_SETTINGS.pop_freq_umi.filter_name], + swegen_freq = [SNV_FILTER_SETTINGS.swegen_snv_freq.tag_value, SNV_FILTER_SETTINGS.swegen_snv_freq.filter_name], + case_name = '{case_name}', + edit_vcf_script = get_script_path("edit_vcf_info.py"), + variant_caller = "tnscope_umi" + threads: + get_threads(cluster_config, 'bcftools_filter_TNscope_umi_research_tumor_normal') + message: + "Filtering TNscope_umi tumor-normal annotated research variants using bcftools and " + "adding FOUND_IN tags to the output VCF file for {params.case_name} " + shell: + """ +bcftools view --threads {threads} -f PASS,triallelic_site {input.vcf_snv_research} | \ +bcftools filter --threads {threads} --include 'INFO/GNOMADAF_popmax <= {params.pop_freq[0]} || INFO/GNOMADAF_popmax == \".\"' --soft-filter '{params.pop_freq[1]}' --mode '+' | \ +bcftools filter --threads {threads} --include 'INFO/SWEGENAF <= {params.swegen_freq[0]} || INFO/SWEGENAF == \".\"' --soft-filter '{params.swegen_freq[1]}' --mode '+' |\ +bcftools view --threads {threads} -i 'FILTER == "PASS" || FILTER == "triallelic_site"' -o {output.vcf_pass_tnscope_umi}.temp1 -O z; + +python {params.edit_vcf_script} \ +--input_vcf {output.vcf_pass_tnscope_umi}.temp1 \ +--output_vcf {output.vcf_pass_tnscope_umi}.temp2 \ +--variant_caller {params.variant_caller}; + +bgzip -@ {threads} -l 9 -c {output.vcf_pass_tnscope_umi}.temp2 > {output.vcf_pass_tnscope_umi} + +tabix -p vcf -f {output.vcf_pass_tnscope_umi}; + +bcftools +counts {output.vcf_pass_tnscope_umi} > {output.bcftools_counts_research}; + +rm {output.vcf_pass_tnscope_umi}.temp1; + +rm {output.vcf_pass_tnscope_umi}.temp2; + """ + + +rule bcftools_filter_TNscope_umi_clinical_tumor_normal: + input: + vcf_snv_clinical = vep_dir + "{var_type}.somatic.{case_name}.tnscope_umi.clinical.vcf.gz", + namemap = vep_dir + "status_to_sample_id_namemap" + output: + vcf_pass_tnscope_umi = vep_dir + "{var_type}.somatic.{case_name}.tnscope_umi.clinical.filtered.pass.vcf.gz", + bcftools_counts_clinical = vep_dir + "{var_type}.somatic.{case_name}.tnscope_umi.clinical.filtered.pass.stats" + benchmark: + Path(benchmark_dir, 'bcftools_filter_TNscope_umi_clinical_tumor_normal' + "{var_type}.somatic.{case_name}.tsv").as_posix() + singularity: + Path(singularity_image, config["bioinfo_tools"].get("bcftools") + ".sif").as_posix() + params: + pop_freq = [SNV_FILTER_SETTINGS.pop_freq_umi.tag_value, SNV_FILTER_SETTINGS.pop_freq_umi.filter_name], + swegen_freq = [SNV_FILTER_SETTINGS.swegen_snv_freq.tag_value, SNV_FILTER_SETTINGS.swegen_snv_freq.filter_name], + loqusdb_clinical_freq = [SNV_FILTER_SETTINGS.loqusdb_clinical_snv_freq.tag_value, SNV_FILTER_SETTINGS.loqusdb_clinical_snv_freq.filter_name], + housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "clinical"}, + case_name = '{case_name}', + edit_vcf_script = get_script_path("edit_vcf_info.py"), + variant_caller = "tnscope_umi" + threads: + get_threads(cluster_config, 'bcftools_filter_TNscope_umi_clinical_tumor_normal') + message: + "Filtering TNscope_umi tumor-normal annotated clinical variants using bcftools and " + "adding FOUND_IN tags to the output VCF file for {params.case_name} " + shell: + """ +bcftools reheader --threads {threads} -s {input.namemap} {input.vcf_snv_clinical} |\ +bcftools view --threads {threads} -f PASS,triallelic_site | \ +bcftools filter --threads {threads} --include 'INFO/GNOMADAF_popmax <= {params.pop_freq[0]} || INFO/GNOMADAF_popmax == \".\"' --soft-filter '{params.pop_freq[1]}' --mode '+' | \ +bcftools filter --threads {threads} --include 
'INFO/SWEGENAF <= {params.swegen_freq[0]} || INFO/SWEGENAF == \".\"' --soft-filter '{params.swegen_freq[1]}' --mode '+' |\ +bcftools filter --threads {threads} --include 'INFO/Frq <= {params.loqusdb_clinical_freq[0]} || INFO/Frq == \".\"' --soft-filter '{params.loqusdb_clinical_freq[1]}' --mode '+' |\ +bcftools view --threads {threads} -i 'FILTER == "PASS" || FILTER == "triallelic_site"' -O z -o {output.vcf_pass_tnscope_umi}.temp1; + +python {params.edit_vcf_script} \ +--input_vcf {output.vcf_pass_tnscope_umi}.temp1 \ +--output_vcf {output.vcf_pass_tnscope_umi}.temp2 \ +--variant_caller {params.variant_caller}; + +bgzip -@ {threads} -l 9 -c {output.vcf_pass_tnscope_umi}.temp2 > {output.vcf_pass_tnscope_umi} + +tabix -p vcf -f {output.vcf_pass_tnscope_umi}; + +bcftools +counts {output.vcf_pass_tnscope_umi} > {output.bcftools_counts_clinical}; + +rm {output.vcf_pass_tnscope_umi}.temp1; + +rm {output.vcf_pass_tnscope_umi}.temp2; + """ diff --git a/BALSAMIC/snakemake_rules/annotation/varcaller_wgs_filter_tumor_normal.rule b/BALSAMIC/snakemake_rules/annotation/varcaller_filter_tumor_normal_wgs.rule similarity index 85% rename from BALSAMIC/snakemake_rules/annotation/varcaller_wgs_filter_tumor_normal.rule rename to BALSAMIC/snakemake_rules/annotation/varcaller_filter_tumor_normal_wgs.rule index 1c4f1fd85..29c364f9f 100644 --- a/BALSAMIC/snakemake_rules/annotation/varcaller_wgs_filter_tumor_normal.rule +++ b/BALSAMIC/snakemake_rules/annotation/varcaller_filter_tumor_normal_wgs.rule @@ -14,9 +14,8 @@ rule bcftools_filter_tnscope_research_tumor_normal: singularity: Path(singularity_image, config["bioinfo_tools"].get("bcftools") + ".sif").as_posix() params: - pop_freq = [SENTIEON_CALLER.pop_freq.tag_value, SENTIEON_CALLER.pop_freq.filter_name], - swegen_freq = [SENTIEON_CALLER.swegen_snv_freq.tag_value, SENTIEON_CALLER.swegen_snv_freq.filter_name], - housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "research"}, + pop_freq = [SNV_FILTER_SETTINGS.pop_freq.tag_value, SNV_FILTER_SETTINGS.pop_freq.filter_name], + swegen_freq = [SNV_FILTER_SETTINGS.swegen_snv_freq.tag_value, SNV_FILTER_SETTINGS.swegen_snv_freq.filter_name], case_name = '{case_name}' threads: get_threads(cluster_config, 'bcftools_filter_tnscope_research_tumor_normal') @@ -47,9 +46,9 @@ rule bcftools_filter_tnscope_clinical_tumor_normal: singularity: Path(singularity_image, config["bioinfo_tools"].get("bcftools") + ".sif").as_posix() params: - pop_freq = [SENTIEON_CALLER.pop_freq.tag_value, SENTIEON_CALLER.pop_freq.filter_name], - swegen_freq = [SENTIEON_CALLER.swegen_snv_freq.tag_value, SENTIEON_CALLER.swegen_snv_freq.filter_name], - loqusdb_clinical_freq = [SENTIEON_CALLER.loqusdb_clinical_snv_freq.tag_value, SENTIEON_CALLER.loqusdb_clinical_snv_freq.filter_name], + pop_freq = [SNV_FILTER_SETTINGS.pop_freq.tag_value, SNV_FILTER_SETTINGS.pop_freq.filter_name], + swegen_freq = [SNV_FILTER_SETTINGS.swegen_snv_freq.tag_value, SNV_FILTER_SETTINGS.swegen_snv_freq.filter_name], + loqusdb_clinical_freq = [SNV_FILTER_SETTINGS.loqusdb_clinical_snv_freq.tag_value, SNV_FILTER_SETTINGS.loqusdb_clinical_snv_freq.filter_name], housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "clinical"}, case_name = '{case_name}', threads: diff --git a/BALSAMIC/snakemake_rules/annotation/varcaller_filter_tumor_only.rule b/BALSAMIC/snakemake_rules/annotation/varcaller_filter_tumor_only.rule index 52c66d584..17ff197ac 100644 --- a/BALSAMIC/snakemake_rules/annotation/varcaller_filter_tumor_only.rule +++ 
b/BALSAMIC/snakemake_rules/annotation/varcaller_filter_tumor_only.rule @@ -3,225 +3,72 @@ # NGS filters for various scenarios -rule bcftools_filter_vardict_research_tumor_only: +rule bcftools_filter_merged_research_tumor_only: input: - vcf_snv_research = vep_dir + "{var_type}.somatic.{case_name}.vardict.research.vcf.gz", + vcf_snv_research = vep_dir + "{var_type}.somatic.{case_name}.merged.research.vcf.gz", output: - vcf_pass_vardict = vep_dir + "{var_type}.somatic.{case_name}.vardict.research.filtered.pass.vcf.gz", - bcftools_counts_research = vep_dir + "{var_type}.somatic.{case_name}.vardict.research.filtered.pass.stats" + vcf_pass_merged = vep_dir + "{var_type}.somatic.{case_name}.merged.research.filtered.pass.vcf.gz", + bcftools_counts_research = vep_dir + "{var_type}.somatic.{case_name}.merged.research.filtered.pass.stats" benchmark: - Path(benchmark_dir, 'bcftools_filter_vardict_research_tumor_only_' + "{var_type}.somatic.{case_name}.tsv").as_posix() + Path(benchmark_dir, 'bcftools_filter_merged_research_tumor_only_' + "{var_type}.somatic.{case_name}.tsv").as_posix() singularity: Path(singularity_image, config["bioinfo_tools"].get("bcftools") + ".sif").as_posix() params: - housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "research"}, - pop_freq = [VARDICT.pop_freq.tag_value, VARDICT.pop_freq.filter_name], - swegen_freq = [VARDICT.swegen_snv_freq.tag_value, VARDICT.swegen_snv_freq.filter_name], - case_name = '{case_name}', - edit_vcf_script = get_script_path("edit_vcf_info.py"), - variant_caller = "vardict" + pop_freq = [SNV_FILTER_SETTINGS.pop_freq.tag_value, SNV_FILTER_SETTINGS.pop_freq.filter_name], + swegen_freq = [SNV_FILTER_SETTINGS.swegen_snv_freq.tag_value, SNV_FILTER_SETTINGS.swegen_snv_freq.filter_name], + case_name = '{case_name}' threads: - get_threads(cluster_config, 'bcftools_filter_vardict_research_tumor_only') + get_threads(cluster_config, 'bcftools_filter_tnscope_research_tumor_only') message: - "Filtering vardict tumor-only annotated research variants using bcftools and " - "adding FOUND_IN tags to the output VCF for {params.case_name}" + "Filtering TGA tumor-only merged annotated research variants using bcftools for {params.case_name}" shell: """ -bcftools view {input.vcf_snv_research} | \ -bcftools filter --threads {threads} --include 'INFO/GNOMADAF_popmax <= {params.pop_freq[0]} || INFO/GNOMADAF_popmax == \".\"' --soft-filter '{params.pop_freq[1]}' --mode '+' | \ -bcftools filter --threads {threads} --include 'INFO/SWEGENAF <= {params.swegen_freq[0]} || INFO/SWEGENAF == \".\"' --soft-filter '{params.swegen_freq[1]}' --mode '+' |\ -bcftools view --threads {threads} -f PASS -o {output.vcf_pass_vardict}.temp1 -O z; - -python {params.edit_vcf_script} \ ---input_vcf {output.vcf_pass_vardict}.temp1 \ ---output_vcf {output.vcf_pass_vardict}.temp2 \ ---variant_caller {params.variant_caller}; - -bgzip -@ {threads} -l 9 -c {output.vcf_pass_vardict}.temp2 > {output.vcf_pass_vardict} - -tabix -p vcf -f {output.vcf_pass_vardict}; - -bcftools +counts {output.vcf_pass_vardict} > {output.bcftools_counts_research}; - -rm {output.vcf_pass_vardict}.temp1; - -rm {output.vcf_pass_vardict}.temp2; - """ - - -rule bcftools_filter_tnhaplotyper_tumor_only: - input: - vcf = vep_dir + "{var_type}.somatic.{case_name}.tnhaplotyper.research.vcf.gz", - output: - vcf_filtered = vep_dir + "{var_type}.somatic.{case_name}.tnhaplotyper.research.filtered.vcf.gz", - vcf_pass_tnhaplotyper = vep_dir + "{var_type}.somatic.{case_name}.tnhaplotyper.research.filtered.pass.vcf.gz", - benchmark: 
- Path(benchmark_dir, 'bcftools_filter_tnhaplotyper_tumor_only' + "{var_type}.somatic.{case_name}.tsv").as_posix() - singularity: - Path(singularity_image, config["bioinfo_tools"].get("bcftools") + ".sif").as_posix() - params: - pop_freq = [COMMON_FILTERS.pop_freq.tag_value, COMMON_FILTERS.pop_freq.filter_name], - case_name = '{case_name}' - threads: - get_threads(cluster_config, 'bcftools_filter_tnhaplotyper_tumor_only') - message: - "Filtering tnhaplotyper tumor-only annotated variants using bcftools for {params.case_name}" - shell: - """ -bcftools view {input.vcf} \ -| bcftools filter --include 'INFO/GNOMADAF_popmax <= {params.pop_freq[0]} || INFO/GNOMADAF_popmax == \".\"' --soft-filter '{params.pop_freq[1]}' --mode '+' \ -| bcftools view --threads {threads} -o {output.vcf_filtered} -O z; - -tabix -p vcf -f {output.vcf_filtered}; - -bcftools view -f PASS -o {output.vcf_pass_tnhaplotyper} -O z {output.vcf_filtered}; - -tabix -p vcf -f {output.vcf_pass_tnhaplotyper}; - - """ - -rule bcftools_filter_TNscope_umi_research_tumor_only: - input: - vcf_snv_research = vep_dir + "{var_type}.somatic.{case_name}.tnscope_umi.research.vcf.gz", - output: - vcf_pass_tnscope_umi = vep_dir + "{var_type}.somatic.{case_name}.tnscope_umi.research.filtered.pass.vcf.gz", - bcftools_counts_research = vep_dir + "{var_type}.somatic.{case_name}.tnscope_umi.research.filtered.pass.stats" - benchmark: - Path(benchmark_dir, 'bcftools_filter_TNscope_umi_research_tumor_only' + "{var_type}.somatic.{case_name}.tsv").as_posix() - singularity: - Path(singularity_image, config["bioinfo_tools"].get("bcftools") + ".sif").as_posix() - params: - pop_freq = [SENTIEON_CALLER.pop_freq_umi.tag_value, SENTIEON_CALLER.pop_freq_umi.filter_name], - swegen_freq = [SENTIEON_CALLER.swegen_snv_freq.tag_value, SENTIEON_CALLER.swegen_snv_freq.filter_name], - housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "research"}, - case_name = '{case_name}', - edit_vcf_script = get_script_path("edit_vcf_info.py"), - variant_caller = "tnscope_umi" - threads: - get_threads(cluster_config, 'bcftools_filter_tnscope_umi_research_tumor_only') - message: - "Filtering tnscope_umi tumor-only annotated research variants using bcftools and " - "adding FOUND_IN tags to the output VCF for {params.case_name}" - shell: - """ -bcftools view --threads {threads} -f PASS,triallelic_site {input.vcf_snv_research} | \ -bcftools filter --include 'INFO/GNOMADAF_popmax <= {params.pop_freq[0]} || INFO/GNOMADAF_popmax == \".\"' --soft-filter '{params.pop_freq[1]}' --mode '+' | \ +bcftools view -f PASS,triallelic_site --threads {threads} {input.vcf_snv_research} | \ +bcftools filter --threads {threads} --include 'INFO/GNOMADAF_popmax <= {params.pop_freq[0]} || INFO/GNOMADAF_popmax == \".\"' --soft-filter '{params.pop_freq[1]}' --mode '+' | \ bcftools filter --threads {threads} --include 'INFO/SWEGENAF <= {params.swegen_freq[0]} || INFO/SWEGENAF == \".\"' --soft-filter '{params.swegen_freq[1]}' --mode '+' |\ -bcftools view --threads {threads} -i 'FILTER == "PASS" || FILTER == "triallelic_site"' -o {output.vcf_pass_tnscope_umi}.temp1 -O z; - -python {params.edit_vcf_script} \ ---input_vcf {output.vcf_pass_tnscope_umi}.temp1 \ ---output_vcf {output.vcf_pass_tnscope_umi}.temp2 \ ---variant_caller {params.variant_caller}; +bcftools view --threads {threads} -i 'FILTER == "PASS" || FILTER == "triallelic_site"' -o {output.vcf_pass_merged} -O z; -bgzip -@ {threads} -l 9 -c {output.vcf_pass_tnscope_umi}.temp2 > {output.vcf_pass_tnscope_umi} +tabix -p vcf -f 
{output.vcf_pass_merged}; -tabix -p vcf -f {output.vcf_pass_tnscope_umi}; - -bcftools +counts {output.vcf_pass_tnscope_umi} > {output.bcftools_counts_research}; - -rm {output.vcf_pass_tnscope_umi}.temp1; - -rm {output.vcf_pass_tnscope_umi}.temp2; +bcftools +counts {output.vcf_pass_merged} > {output.bcftools_counts_research}; """ - -rule bcftools_filter_vardict_clinical_tumor_only: +rule bcftools_filter_merged_clinical_tumor_only: input: - vcf_snv_clinical = vep_dir + "{var_type}.somatic.{case_name}.vardict.clinical.vcf.gz", + vcf_snv_clinical = vep_dir + "{var_type}.somatic.{case_name}.merged.clinical.vcf.gz", namemap = vep_dir + "status_to_sample_id_namemap" output: - vcf_pass_vardict = vep_dir + "{var_type}.somatic.{case_name}.vardict.clinical.filtered.pass.vcf.gz", - bcftools_counts_clinical = vep_dir + "{var_type}.somatic.{case_name}.vardict.clinical.filtered.pass.stats" + vcf_pass_merged = vep_dir + "{var_type}.somatic.{case_name}.merged.clinical.filtered.pass.vcf.gz", + bcftools_counts_clinical = vep_dir + "{var_type}.somatic.{case_name}.merged.clinical.filtered.pass.stats" benchmark: - Path(benchmark_dir, 'bcftools_filter_vardict_clinical_tumor_only_' + "{var_type}.somatic.{case_name}.tsv").as_posix() + Path(benchmark_dir, 'bcftools_filter_merged_clinical_tumor_only_' + "{var_type}.somatic.{case_name}.tsv").as_posix() singularity: Path(singularity_image, config["bioinfo_tools"].get("bcftools") + ".sif").as_posix() params: - housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "clinical"}, - pop_freq = [VARDICT.pop_freq.tag_value, VARDICT.pop_freq.filter_name], - swegen_freq = [VARDICT.swegen_snv_freq.tag_value, VARDICT.swegen_snv_freq.filter_name], - loqusdb_clinical_freq = [VARDICT.loqusdb_clinical_snv_freq.tag_value, VARDICT.loqusdb_clinical_snv_freq.filter_name], - case_name = '{case_name}', - edit_vcf_script = get_script_path("edit_vcf_info.py"), - variant_caller = "vardict" + pop_freq = [SNV_FILTER_SETTINGS.pop_freq.tag_value, SNV_FILTER_SETTINGS.pop_freq.filter_name], + swegen_freq = [SNV_FILTER_SETTINGS.swegen_snv_freq.tag_value, SNV_FILTER_SETTINGS.swegen_snv_freq.filter_name], + loqusdb_clinical_freq = [SNV_FILTER_SETTINGS.loqusdb_clinical_snv_freq.tag_value, SNV_FILTER_SETTINGS.loqusdb_clinical_snv_freq.filter_name], + housekeeper_id= {"id": config["analysis"]["case_id"],"tags": "clinical"}, + case_name = '{case_name}' threads: - get_threads(cluster_config, 'bcftools_filter_vardict_clinical_tumor_only') + get_threads(cluster_config, 'bcftools_filter_tnscope_clinical_tumor_only') message: - "Filtering vardict tumor-only annotated clinical variants using bcftools and " - "adding FOUND_IN tags to the output VCF for {params.case_name}" + "Filtering TGA tumor-only merged annotated clinical variants using bcftools for {params.case_name}" shell: """ -bcftools reheader --threads {threads} -s {input.namemap} {input.vcf_snv_clinical} |\ -bcftools filter --include 'INFO/GNOMADAF_popmax <= {params.pop_freq[0]} || INFO/GNOMADAF_popmax == \".\"' --soft-filter '{params.pop_freq[1]}' --mode '+' | \ -bcftools filter --threads {threads} --include 'INFO/SWEGENAF <= {params.swegen_freq[0]} || INFO/SWEGENAF == \".\"' --soft-filter '{params.swegen_freq[1]}' --mode '+' |\ -bcftools filter --threads {threads} --include 'INFO/Frq <= {params.loqusdb_clinical_freq[0]} || INFO/Frq == \".\"' --soft-filter '{params.loqusdb_clinical_freq[1]}' --mode '+' |\ -bcftools view --threads {threads} -f PASS -o {output.vcf_pass_vardict}.temp1 -O z; - - -python {params.edit_vcf_script} \ ---input_vcf 
{output.vcf_pass_vardict}.temp1 \ ---output_vcf {output.vcf_pass_vardict}.temp2 \ ---variant_caller {params.variant_caller}; - -bgzip -@ {threads} -l 9 -c {output.vcf_pass_vardict}.temp2 > {output.vcf_pass_vardict} - -tabix -p vcf -f {output.vcf_pass_vardict}; - -bcftools +counts {output.vcf_pass_vardict} > {output.bcftools_counts_clinical}; - -rm {output.vcf_pass_vardict}.temp1; - -rm {output.vcf_pass_vardict}.temp2; - """ - -rule bcftools_filter_TNscope_umi_clinical_tumor_only: - input: - vcf_snv_clinical = vep_dir + "{var_type}.somatic.{case_name}.tnscope_umi.clinical.vcf.gz", - namemap = vep_dir + "status_to_sample_id_namemap" - output: - vcf_pass_tnscope_umi = vep_dir + "{var_type}.somatic.{case_name}.tnscope_umi.clinical.filtered.pass.vcf.gz", - bcftools_counts_clinical = vep_dir + "{var_type}.somatic.{case_name}.tnscope_umi.clinical.filtered.pass.stats" - benchmark: - Path(benchmark_dir,'bcftools_filter_TNscope_umi_clinical_tumor_only' + "{var_type}.somatic.{case_name}.tsv").as_posix() - singularity: - Path(singularity_image,config["bioinfo_tools"].get("bcftools") + ".sif").as_posix() - params: - pop_freq=[SENTIEON_CALLER.pop_freq_umi.tag_value, SENTIEON_CALLER.pop_freq_umi.filter_name], - swegen_freq = [SENTIEON_CALLER.swegen_snv_freq.tag_value, SENTIEON_CALLER.swegen_snv_freq.filter_name], - loqusdb_clinical_freq = [SENTIEON_CALLER.loqusdb_clinical_snv_freq.tag_value, SENTIEON_CALLER.loqusdb_clinical_snv_freq.filter_name], - housekeeper_id={"id": config["analysis"]["case_id"], "tags": "clinical"}, - case_name='{case_name}', - edit_vcf_script=get_script_path("edit_vcf_info.py"), - variant_caller="tnscope_umi" - threads: - get_threads(cluster_config,'bcftools_filter_tnscope_umi_tumor_only') - message: - "Filtering tnscope_umi tumor-only annotated clinical variants using bcftools and " - "adding FOUND_IN tags to the output VCF for {params.case_name}" - shell: - """ -bcftools reheader --threads {threads} -s {input.namemap} {input.vcf_snv_clinical} |\ -bcftools view --threads {threads} -f PASS,triallelic_site | \ +bcftools view {input.vcf_snv_clinical} | \ +bcftools reheader --threads {threads} -s {input.namemap} | \ +bcftools view -f PASS,triallelic_site --threads {threads} | \ bcftools filter --threads {threads} --include 'INFO/GNOMADAF_popmax <= {params.pop_freq[0]} || INFO/GNOMADAF_popmax == \".\"' --soft-filter '{params.pop_freq[1]}' --mode '+' | \ -bcftools filter --threads {threads} --include 'INFO/SWEGENAF <= {params.swegen_freq[0]} || INFO/SWEGENAF == \".\"' --soft-filter '{params.swegen_freq[1]}' --mode '+' |\ -bcftools filter --threads {threads} --include 'INFO/Frq <= {params.loqusdb_clinical_freq[0]} || INFO/Frq == \".\"' --soft-filter '{params.loqusdb_clinical_freq[1]}' --mode '+' |\ -bcftools view --threads {threads} -i 'FILTER == "PASS" || FILTER == "triallelic_site"' -O z -o {output.vcf_pass_tnscope_umi}.temp1; +bcftools filter --threads {threads} --include 'INFO/SWEGENAF <= {params.swegen_freq[0]} || INFO/SWEGENAF == \".\"' --soft-filter '{params.swegen_freq[1]}' --mode '+' | \ +bcftools filter --threads {threads} --include 'INFO/Frq <= {params.loqusdb_clinical_freq[0]} || INFO/Frq == \".\"' --soft-filter '{params.loqusdb_clinical_freq[1]}' --mode '+' | \ +bcftools view --threads {threads} -i 'FILTER == "PASS" || FILTER == "triallelic_site"' -o {output.vcf_pass_merged} -O z; -python {params.edit_vcf_script} \ ---input_vcf {output.vcf_pass_tnscope_umi}.temp1 \ ---output_vcf {output.vcf_pass_tnscope_umi}.temp2 \ ---variant_caller {params.variant_caller}; +tabix -p vcf -f 
{output.vcf_pass_merged}; -bgzip -@ {threads} -l 9 -c {output.vcf_pass_tnscope_umi}.temp2 > {output.vcf_pass_tnscope_umi} - -tabix -p vcf -f {output.vcf_pass_tnscope_umi}; - -bcftools +counts {output.vcf_pass_tnscope_umi} > {output.bcftools_counts_clinical}; +bcftools +counts {output.vcf_pass_merged} > {output.bcftools_counts_clinical}; + """ -rm {output.vcf_pass_tnscope_umi}.temp1; -rm {output.vcf_pass_tnscope_umi}.temp2; - """ diff --git a/BALSAMIC/snakemake_rules/annotation/varcaller_filter_tumor_only_umi.rule b/BALSAMIC/snakemake_rules/annotation/varcaller_filter_tumor_only_umi.rule new file mode 100644 index 000000000..47c68e84c --- /dev/null +++ b/BALSAMIC/snakemake_rules/annotation/varcaller_filter_tumor_only_umi.rule @@ -0,0 +1,98 @@ +# vim: syntax=python tabstop=4 expandtab +# coding: utf-8 +# NGS filters for various scenarios + + +rule bcftools_filter_TNscope_umi_research_tumor_only: + input: + vcf_snv_research = vep_dir + "{var_type}.somatic.{case_name}.tnscope_umi.research.vcf.gz", + output: + vcf_pass_tnscope_umi = vep_dir + "{var_type}.somatic.{case_name}.tnscope_umi.research.filtered.pass.vcf.gz", + bcftools_counts_research = vep_dir + "{var_type}.somatic.{case_name}.tnscope_umi.research.filtered.pass.stats" + benchmark: + Path(benchmark_dir, 'bcftools_filter_TNscope_umi_research_tumor_only' + "{var_type}.somatic.{case_name}.tsv").as_posix() + singularity: + Path(singularity_image, config["bioinfo_tools"].get("bcftools") + ".sif").as_posix() + params: + pop_freq = [SNV_FILTER_SETTINGS.pop_freq_umi.tag_value, SNV_FILTER_SETTINGS.pop_freq_umi.filter_name], + swegen_freq = [SNV_FILTER_SETTINGS.swegen_snv_freq.tag_value, SNV_FILTER_SETTINGS.swegen_snv_freq.filter_name], + case_name = '{case_name}', + edit_vcf_script = get_script_path("edit_vcf_info.py"), + variant_caller = "tnscope_umi" + threads: + get_threads(cluster_config, 'bcftools_filter_tnscope_umi_research_tumor_only') + message: + "Filtering tnscope_umi tumor-only annotated research variants using bcftools and " + "adding FOUND_IN tags to the output VCF for {params.case_name}" + shell: + """ +bcftools view --threads {threads} -f PASS,triallelic_site {input.vcf_snv_research} | \ +bcftools filter --include 'INFO/GNOMADAF_popmax <= {params.pop_freq[0]} || INFO/GNOMADAF_popmax == \".\"' --soft-filter '{params.pop_freq[1]}' --mode '+' | \ +bcftools filter --threads {threads} --include 'INFO/SWEGENAF <= {params.swegen_freq[0]} || INFO/SWEGENAF == \".\"' --soft-filter '{params.swegen_freq[1]}' --mode '+' |\ +bcftools view --threads {threads} -i 'FILTER == "PASS" || FILTER == "triallelic_site"' -o {output.vcf_pass_tnscope_umi}.temp1 -O z; + +python {params.edit_vcf_script} \ +--input_vcf {output.vcf_pass_tnscope_umi}.temp1 \ +--output_vcf {output.vcf_pass_tnscope_umi}.temp2 \ +--variant_caller {params.variant_caller}; + +bgzip -@ {threads} -l 9 -c {output.vcf_pass_tnscope_umi}.temp2 > {output.vcf_pass_tnscope_umi} + +tabix -p vcf -f {output.vcf_pass_tnscope_umi}; + +bcftools +counts {output.vcf_pass_tnscope_umi} > {output.bcftools_counts_research}; + +rm {output.vcf_pass_tnscope_umi}.temp1; + +rm {output.vcf_pass_tnscope_umi}.temp2; + """ + + +rule bcftools_filter_TNscope_umi_clinical_tumor_only: + input: + vcf_snv_clinical = vep_dir + "{var_type}.somatic.{case_name}.tnscope_umi.clinical.vcf.gz", + namemap = vep_dir + "status_to_sample_id_namemap" + output: + vcf_pass_tnscope_umi = vep_dir + "{var_type}.somatic.{case_name}.tnscope_umi.clinical.filtered.pass.vcf.gz", + bcftools_counts_clinical = vep_dir + 
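The UMI filter rules above pipe the PASS set through edit_vcf_info.py to add FOUND_IN tags before re-compressing with bgzip and indexing with tabix. The sketch below is a hypothetical stand-in, not the repository's script: it only shows the kind of INFO-column edit such a step performs on an uncompressed VCF, and the function name, file names, and handling of malformed lines are assumptions.

# Hypothetical stand-in for the kind of edit described above ("adding FOUND_IN tags"):
# append FOUND_IN=<caller> to the INFO column of each body line of a plain-text VCF.
# This is NOT BALSAMIC's edit_vcf_info.py; a real implementation would also declare
# the tag with a matching ##INFO header line.
import sys

def add_found_in(in_path: str, out_path: str, caller: str) -> None:
    with open(in_path) as fin, open(out_path, "w") as fout:
        for line in fin:
            if line.startswith("#"):
                fout.write(line)          # header lines pass through unchanged
                continue
            cols = line.rstrip("\n").split("\t")
            if len(cols) < 8:
                fout.write(line)          # malformed line, pass through untouched
                continue
            info = cols[7]
            cols[7] = f"FOUND_IN={caller}" if info in (".", "") else f"{info};FOUND_IN={caller}"
            fout.write("\t".join(cols) + "\n")

if __name__ == "__main__":
    # usage: python add_found_in.py input.vcf output.vcf tnscope_umi
    add_found_in(sys.argv[1], sys.argv[2], sys.argv[3])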
"{var_type}.somatic.{case_name}.tnscope_umi.clinical.filtered.pass.stats" + benchmark: + Path(benchmark_dir,'bcftools_filter_TNscope_umi_clinical_tumor_only' + "{var_type}.somatic.{case_name}.tsv").as_posix() + singularity: + Path(singularity_image,config["bioinfo_tools"].get("bcftools") + ".sif").as_posix() + params: + pop_freq=[SNV_FILTER_SETTINGS.pop_freq_umi.tag_value, SNV_FILTER_SETTINGS.pop_freq_umi.filter_name], + swegen_freq = [SNV_FILTER_SETTINGS.swegen_snv_freq.tag_value, SNV_FILTER_SETTINGS.swegen_snv_freq.filter_name], + loqusdb_clinical_freq = [SNV_FILTER_SETTINGS.loqusdb_clinical_snv_freq.tag_value, SNV_FILTER_SETTINGS.loqusdb_clinical_snv_freq.filter_name], + housekeeper_id={"id": config["analysis"]["case_id"], "tags": "clinical"}, + case_name='{case_name}', + edit_vcf_script=get_script_path("edit_vcf_info.py"), + variant_caller="tnscope_umi" + threads: + get_threads(cluster_config,'bcftools_filter_tnscope_umi_tumor_only') + message: + "Filtering tnscope_umi tumor-only annotated clinical variants using bcftools and " + "adding FOUND_IN tags to the output VCF for {params.case_name}" + shell: + """ +bcftools reheader --threads {threads} -s {input.namemap} {input.vcf_snv_clinical} |\ +bcftools view --threads {threads} -f PASS,triallelic_site | \ +bcftools filter --threads {threads} --include 'INFO/GNOMADAF_popmax <= {params.pop_freq[0]} || INFO/GNOMADAF_popmax == \".\"' --soft-filter '{params.pop_freq[1]}' --mode '+' | \ +bcftools filter --threads {threads} --include 'INFO/SWEGENAF <= {params.swegen_freq[0]} || INFO/SWEGENAF == \".\"' --soft-filter '{params.swegen_freq[1]}' --mode '+' |\ +bcftools filter --threads {threads} --include 'INFO/Frq <= {params.loqusdb_clinical_freq[0]} || INFO/Frq == \".\"' --soft-filter '{params.loqusdb_clinical_freq[1]}' --mode '+' |\ +bcftools view --threads {threads} -i 'FILTER == "PASS" || FILTER == "triallelic_site"' -O z -o {output.vcf_pass_tnscope_umi}.temp1; + +python {params.edit_vcf_script} \ +--input_vcf {output.vcf_pass_tnscope_umi}.temp1 \ +--output_vcf {output.vcf_pass_tnscope_umi}.temp2 \ +--variant_caller {params.variant_caller}; + +bgzip -@ {threads} -l 9 -c {output.vcf_pass_tnscope_umi}.temp2 > {output.vcf_pass_tnscope_umi} + +tabix -p vcf -f {output.vcf_pass_tnscope_umi}; + +bcftools +counts {output.vcf_pass_tnscope_umi} > {output.bcftools_counts_clinical}; + +rm {output.vcf_pass_tnscope_umi}.temp1; + +rm {output.vcf_pass_tnscope_umi}.temp2; + """ diff --git a/BALSAMIC/snakemake_rules/annotation/varcaller_wgs_filter_tumor_only.rule b/BALSAMIC/snakemake_rules/annotation/varcaller_filter_tumor_only_wgs.rule similarity index 86% rename from BALSAMIC/snakemake_rules/annotation/varcaller_wgs_filter_tumor_only.rule rename to BALSAMIC/snakemake_rules/annotation/varcaller_filter_tumor_only_wgs.rule index bc9c4e9ed..f9d63ce96 100644 --- a/BALSAMIC/snakemake_rules/annotation/varcaller_wgs_filter_tumor_only.rule +++ b/BALSAMIC/snakemake_rules/annotation/varcaller_filter_tumor_only_wgs.rule @@ -15,9 +15,8 @@ rule bcftools_filter_tnscope_research_tumor_only: singularity: Path(singularity_image, config["bioinfo_tools"].get("bcftools") + ".sif").as_posix() params: - pop_freq = [SENTIEON_CALLER.pop_freq.tag_value, SENTIEON_CALLER.pop_freq.filter_name], - swegen_freq = [SENTIEON_CALLER.swegen_snv_freq.tag_value, SENTIEON_CALLER.swegen_snv_freq.filter_name], - housekeeper_id= {"id": config["analysis"]["case_id"],"tags": "research"}, + pop_freq = [SNV_FILTER_SETTINGS.pop_freq.tag_value, SNV_FILTER_SETTINGS.pop_freq.filter_name], + swegen_freq = 
[SNV_FILTER_SETTINGS.swegen_snv_freq.tag_value, SNV_FILTER_SETTINGS.swegen_snv_freq.filter_name], case_name = '{case_name}' threads: get_threads(cluster_config, 'bcftools_filter_tnscope_research_tumor_only') @@ -50,9 +49,9 @@ rule bcftools_filter_tnscope_clinical_tumor_only: singularity: Path(singularity_image, config["bioinfo_tools"].get("bcftools") + ".sif").as_posix() params: - pop_freq = [SENTIEON_CALLER.pop_freq.tag_value, SENTIEON_CALLER.pop_freq.filter_name], - swegen_freq = [SENTIEON_CALLER.swegen_snv_freq.tag_value, SENTIEON_CALLER.swegen_snv_freq.filter_name], - loqusdb_clinical_freq = [SENTIEON_CALLER.loqusdb_clinical_snv_freq.tag_value, SENTIEON_CALLER.loqusdb_clinical_snv_freq.filter_name], + pop_freq = [SNV_FILTER_SETTINGS.pop_freq.tag_value, SNV_FILTER_SETTINGS.pop_freq.filter_name], + swegen_freq = [SNV_FILTER_SETTINGS.swegen_snv_freq.tag_value, SNV_FILTER_SETTINGS.swegen_snv_freq.filter_name], + loqusdb_clinical_freq = [SNV_FILTER_SETTINGS.loqusdb_clinical_snv_freq.tag_value, SNV_FILTER_SETTINGS.loqusdb_clinical_snv_freq.filter_name], housekeeper_id= {"id": config["analysis"]["case_id"],"tags": "clinical"}, case_name = '{case_name}' threads: diff --git a/BALSAMIC/snakemake_rules/quality_control/multiqc.rule b/BALSAMIC/snakemake_rules/quality_control/multiqc.rule index 1541fcea9..03cd40543 100644 --- a/BALSAMIC/snakemake_rules/quality_control/multiqc.rule +++ b/BALSAMIC/snakemake_rules/quality_control/multiqc.rule @@ -4,8 +4,6 @@ # get bamfile multiqc_input = [config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = tumor_sample)] -if config['analysis']['analysis_type'] == "paired": - multiqc_input.append(config_model.get_final_bam_name(bam_dir=bam_dir,sample_name= normal_sample)) # fastqc metrics multiqc_input.extend(expand(fastqc_dir + "{fastq_file_names}_fastqc.zip", fastq_file_names=config_model.get_all_fastq_names(remove_suffix = True))) @@ -24,6 +22,7 @@ qc_metrics_wildcards = ["alignment_summary_metrics.txt", "base_distribution_by_c "samtools.flagstats.txt", "samtools.idxstats.txt", "samtools.stats.txt"] if config["analysis"]["sequencing_type"] == 'targeted': + # for post UMI collapse qc_metrics_wildcards.extend(["umi_collapsed.samtools.flagstats.txt", "umi_collapsed.samtools.idxstats.txt", "umi_collapsed.samtools.stats.txt"]) @@ -105,10 +104,13 @@ else: sample=normal_sample, mosdepth_wc = mosdepth_metrics_wildcard)) if config["analysis"]["analysis_workflow"]=="balsamic-umi": + # UMI cram file + multiqc_input.extend(expand(umi_dir + "tumor.{sample}.consensusfiltered_umi.cram", sample=tumor_sample)) # UMI picard metrics multiqc_input.extend(expand(umi_qc_dir + "tumor.{sample}.umi.collect_hsmetric.txt", sample=tumor_sample)) multiqc_input.extend(expand(umi_qc_dir + "tumor.{sample}.umi.metrics.txt", sample=tumor_sample)) if config['analysis']['analysis_type'] == "paired": + multiqc_input.extend(expand(umi_dir + "normal.{sample}.consensusfiltered_umi.cram", sample=normal_sample)) multiqc_input.extend(expand(umi_qc_dir + "normal.{sample}.umi.collect_hsmetric.txt",sample=normal_sample)) multiqc_input.extend(expand(umi_qc_dir + "normal.{sample}.umi.metrics.txt",sample=normal_sample)) @@ -141,3 +143,4 @@ multiqc --force --outdir {params.qc_dir} \ chmod -R 777 {params.qc_dir}; """ + diff --git a/BALSAMIC/snakemake_rules/quality_control/picard.rule b/BALSAMIC/snakemake_rules/quality_control/picard.rule index 75e527a20..b290c31f5 100644 --- a/BALSAMIC/snakemake_rules/quality_control/picard.rule +++ b/BALSAMIC/snakemake_rules/quality_control/picard.rule @@ -24,7 +24,7 @@ 
rule picard_CollectHsMetrics: baitsetname = Path(config["panel"]["capture_kit"]).name, sample = "{sample}" threads: - get_threads(cluster_config, "picard_CollectHsMetrics") + get_threads(cluster_config, "picard_qc") message: "Calculating picard HsMetrics for sample {params.sample}" shell: diff --git a/BALSAMIC/snakemake_rules/quality_control/picard_common.rule b/BALSAMIC/snakemake_rules/quality_control/picard_common.rule new file mode 100644 index 000000000..aec655194 --- /dev/null +++ b/BALSAMIC/snakemake_rules/quality_control/picard_common.rule @@ -0,0 +1,39 @@ +# vim: syntax=python tabstop=4 expandtab +# coding: utf-8 + +if "canfam3" in config['reference']['reference_genome']: + memory = "20g" +else: + memory = "16g" + +rule picard_CollectAlignmentSummaryMetrics: + input: + bam = lambda wildcards: config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = wildcards.sample), + fa = config["reference"]["reference_genome"] + output: + alignment_metrics = Path(qc_dir,"{sample_type}.{sample}.alignment_summary_metrics.txt").as_posix(), + benchmark: + Path(benchmark_dir, "CollectAlignmentSummaryMetrics.{sample_type}.{sample}.tsv").as_posix() + singularity: + Path(singularity_image,config["bioinfo_tools"].get("picard") + ".sif").as_posix() + params: + mem = "16g", + tmpdir = tempfile.mkdtemp(prefix=tmp_dir), + adapter = config["QC"]["adapter"], + sample = "{sample}" + threads: + get_threads(cluster_config, "picard_qc") + message: + "Calculating picard alignment summary metrics for sample {params.sample}" + shell: + """ +export TMPDIR={params.tmpdir}; +picard -Djava.io.tmpdir={params.tmpdir} -Xmx{params.mem} \ +CollectAlignmentSummaryMetrics \ +R={input.fa} \ +I={input.bam} \ +O={output} \ +ADAPTER_SEQUENCE={params.adapter} \ +METRIC_ACCUMULATION_LEVEL=ALL_READS \ +METRIC_ACCUMULATION_LEVEL=LIBRARY; + """ diff --git a/BALSAMIC/snakemake_rules/quality_control/qc_metrics.rule b/BALSAMIC/snakemake_rules/quality_control/qc_metrics.rule index 860746bac..d1a3dc988 100644 --- a/BALSAMIC/snakemake_rules/quality_control/qc_metrics.rule +++ b/BALSAMIC/snakemake_rules/quality_control/qc_metrics.rule @@ -4,12 +4,12 @@ bcftools_counts_input = [] if config["analysis"]["analysis_workflow"] == "balsamic": - bcftools_counts_input.append(vep_dir + "SV.somatic." + config["analysis"]["case_id"] + ".svdb.clinical.filtered.pass.stats") - - if config["analysis"]["sequencing_type"] == 'wgs': + if config["analysis"]["sequencing_type"] == "wgs": + bcftools_counts_input.append(vep_dir + "SV.somatic." + config["analysis"]["case_id"] + ".svdb.clinical.filtered.pass.stats") bcftools_counts_input.append(vep_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".tnscope.clinical.filtered.pass.stats") else: - bcftools_counts_input.append(vep_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".vardict.clinical.filtered.pass.stats") + bcftools_counts_input.append(vep_dir + "SV.somatic." + config["analysis"]["case_id"] + ".svdb.clinical.filtered.pass.stats") + bcftools_counts_input.append(vep_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".merged.clinical.filtered.pass.stats") if config["analysis"]["analysis_workflow"] == "balsamic-umi": bcftools_counts_input.append(vep_dir + "SNV.somatic." 
+ config["analysis"]["case_id"] + ".tnscope_umi.clinical.filtered.pass.stats") diff --git a/BALSAMIC/snakemake_rules/quality_control/sentieon_qc_metrics.rule b/BALSAMIC/snakemake_rules/quality_control/sentieon_qc_metrics.rule index 6f64133f5..96df58f60 100644 --- a/BALSAMIC/snakemake_rules/quality_control/sentieon_qc_metrics.rule +++ b/BALSAMIC/snakemake_rules/quality_control/sentieon_qc_metrics.rule @@ -53,7 +53,6 @@ if config["analysis"]["sequencing_type"] == 'wgs': ref = config["reference"]["reference_genome"], bam_files = lambda wildcards: config_model.get_bam_name_per_lane(bam_dir = bam_dir, sample_name = wildcards.sample) output: - alignment_metrics = Path(qc_dir,"{sample_type}.{sample}.alignment_summary_metrics.txt").as_posix(), insert_size_metrics = Path(qc_dir,"{sample_type}.{sample}.insert_size_metrics.txt").as_posix(), gc_bias_metrics = Path(qc_dir,"{sample_type}.{sample}.gc_bias.txt").as_posix(), gc_bias_summary = Path(qc_dir,"{sample_type}.{sample}.gc_bias_summary.txt").as_posix(), @@ -86,7 +85,6 @@ if config["analysis"]["sequencing_type"] == 'wgs': -t {threads} \ -r {input.ref} \ -i $shell_bam_files \ - --algo AlignmentStat --adapter_seq {params.adapter} {output.alignment_metrics} \ --algo InsertSizeMetricAlgo --min_read_ratio {params.min_read_ratio} {output.insert_size_metrics} \ --algo GCBias --summary {output.gc_bias_summary} {output.gc_bias_metrics} \ --algo MeanQualityByCycle {output.quality_by_cycle_metrics} \ @@ -101,7 +99,6 @@ else: ref=config["reference"]["reference_genome"], bam=lambda wildcards: config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = wildcards.sample, specified_suffix="align_sort"), output: - alignment_metrics=Path(qc_dir,"{sample_type}.{sample}.alignment_summary_metrics.txt").as_posix(), insert_size_metrics=Path(qc_dir,"{sample_type}.{sample}.insert_size_metrics.txt").as_posix(), gc_bias_metrics=Path(qc_dir,"{sample_type}.{sample}.gc_bias.txt").as_posix(), gc_bias_summary=Path(qc_dir,"{sample_type}.{sample}.gc_bias_summary.txt").as_posix(), @@ -132,7 +129,6 @@ else: -t {threads} \ -r {input.ref} \ -i {input.bam} \ - --algo AlignmentStat --adapter_seq {params.adapter} {output.alignment_metrics} \ --algo InsertSizeMetricAlgo --min_read_ratio {params.min_read_ratio} {output.insert_size_metrics} \ --algo GCBias --summary {output.gc_bias_summary} {output.gc_bias_metrics} \ --algo MeanQualityByCycle {output.quality_by_cycle_metrics} \ diff --git a/BALSAMIC/snakemake_rules/umi/modify_tnscope_infofield_umi.rule b/BALSAMIC/snakemake_rules/umi/modify_tnscope_infofield_umi.rule new file mode 100644 index 000000000..4e4371c3b --- /dev/null +++ b/BALSAMIC/snakemake_rules/umi/modify_tnscope_infofield_umi.rule @@ -0,0 +1,32 @@ + +rule modify_tnscope_infofield_umi: + input: + vcf_tnscope_umi = vcf_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".tnscope_umi.preprocess.vcf.gz", + output: + vcf_tnscope_umi = vcf_dir + "SNV.somatic." 
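The qc_metrics.rule hunk above changes which bcftools +counts stats files feed the QC collection, depending on the analysis workflow and sequencing type. As a readability aid, here is a small Python restatement of that branching with a placeholder case id and vep directory; it mirrors the rule's logic and is not itself part of the pipeline.

# Restatement of the stats-file selection encoded in qc_metrics.rule (paths and case id are placeholders).
def bcftools_counts_inputs(analysis_workflow: str, sequencing_type: str,
                           case_id: str = "case", vep_dir: str = "vep/") -> list:
    inputs = []
    if analysis_workflow == "balsamic":
        if sequencing_type == "wgs":
            inputs.append(f"{vep_dir}SV.somatic.{case_id}.svdb.clinical.filtered.pass.stats")
            inputs.append(f"{vep_dir}SNV.somatic.{case_id}.tnscope.clinical.filtered.pass.stats")
        else:  # targeted panels now use the merged (vardict + tnscope) SNV stats
            inputs.append(f"{vep_dir}SV.somatic.{case_id}.svdb.clinical.filtered.pass.stats")
            inputs.append(f"{vep_dir}SNV.somatic.{case_id}.merged.clinical.filtered.pass.stats")
    if analysis_workflow == "balsamic-umi":
        inputs.append(f"{vep_dir}SNV.somatic.{case_id}.tnscope_umi.clinical.filtered.pass.stats")
    return inputs

print(bcftools_counts_inputs("balsamic", "targeted"))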
+ config["analysis"]["case_id"] + ".tnscope_umi.vcf.gz", + benchmark: + Path(benchmark_dir,'modify_tnscope_infofield_umi_' + config[ "analysis" ][ "case_id" ] + ".tsv").as_posix() + singularity: + Path(singularity_image, config["bioinfo_tools"].get("bcftools") + ".sif").as_posix() + params: + housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "research"}, + modify_tnscope_infofield = get_script_path("modify_tnscope_infofield.py"), + tmpdir = tempfile.mkdtemp(prefix=tmp_dir), + case_name = config["analysis"]["case_id"] + threads: + get_threads(cluster_config, 'modify_tnscope_infofield') + message: + "Add DP and AF tumor sample info to INFO field for case: {params.case_name}" + shell: + """ +export TMPDIR={params.tmpdir}; +mkdir -p {params.tmpdir}; + +cp {input.vcf_tnscope_umi} {params.tmpdir}/tnscope_umi_preprocess.vcf.gz ; +gunzip {params.tmpdir}/tnscope_umi_preprocess.vcf.gz ; + +python {params.modify_tnscope_infofield} {params.tmpdir}/tnscope_umi_preprocess.vcf {params.tmpdir}/vcf_tnscope_snvs_modified.vcf ; +bgzip {params.tmpdir}/vcf_tnscope_snvs_modified.vcf ; +mv {params.tmpdir}/vcf_tnscope_snvs_modified.vcf.gz {output.vcf_tnscope_umi} ; +tabix -p vcf -f {output.vcf_tnscope_umi} ; + """ diff --git a/BALSAMIC/snakemake_rules/umi/sentieon_consensuscall.rule b/BALSAMIC/snakemake_rules/umi/sentieon_consensuscall.rule index cccce01ff..7a4b61ad1 100644 --- a/BALSAMIC/snakemake_rules/umi/sentieon_consensuscall.rule +++ b/BALSAMIC/snakemake_rules/umi/sentieon_consensuscall.rule @@ -1,7 +1,6 @@ # vim: syntax=python tabstop=4 expandtab # coding: utf-8 - rule sentieon_consensuscall_umi: """UMI-consensus calling""" input: @@ -42,7 +41,6 @@ export SENTIEON_LICENSE={params.sentieon_lic}; rm -rf {params.tmpdir} """ - rule sentieon_bwa_umiconsensus: """Alignment of consensus reads""" input: @@ -89,13 +87,12 @@ export SENTIEON_LICENSE={params.sentieon_lic}; rm -rf {params.tmpdir} """ - rule sentieon_consensusfilter_umi: """Filter consensus called reads based on 'XZ' filtering""" input: umi_dir + "{sample_type}.{sample}.consensuscalled_umi.bam" output: - temp(umi_dir + "{sample_type}.{sample}.consensusfiltered_umi.bam") + umi_dir + "{sample_type}.{sample}.consensusfiltered_umi.bam" benchmark: Path(benchmark_dir, "sentieon_consensusfilter_umi_{sample_type}_{sample}.tsv").as_posix() singularity: @@ -119,3 +116,50 @@ samtools view -bh - > {output}; samtools index {output}; """ + +rule bam_compress_tumor_umi: + input: + bam = umi_dir + "tumor.{sample}.consensusfiltered_umi.bam", + fasta = config_model.reference["reference_genome"] + output: + cram = umi_dir + "tumor.{sample}.consensusfiltered_umi.cram", + benchmark: + Path(benchmark_dir, "bam_compress_tumor_umi_{sample}.tsv").as_posix() + singularity: + Path(singularity_image, config["bioinfo_tools"].get("compress") + ".sif").as_posix() + params: + sample_id = "{sample}", + housekeeper_id= {"id": tumor_sample, "tags": "umi_tumor"} + threads: + get_threads(cluster_config, "bam_compress") + message: + "Compressing UMI bam to cram for {params.sample_id}" + shell: + """ +samtools view -h -T {input.fasta} --threads {threads} -C -o {output.cram} {input.bam}; +samtools index {output.cram}; + """ + +if config['analysis']['analysis_type'] == "paired": + rule bam_compress_normal_umi: + input: + bam = umi_dir + "normal.{sample}.consensusfiltered_umi.bam", + fasta = config_model.reference["reference_genome"] + output: + cram = umi_dir + "normal.{sample}.consensusfiltered_umi.cram" + benchmark: + Path(benchmark_dir, 
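The bam_compress_tumor_umi rule above (and its paired normal counterpart that follows) converts the consensus-filtered UMI BAM to CRAM with samtools and indexes the result. Below is a minimal Python wrapper around the same two samtools calls, assuming samtools is on PATH; the file names in the usage block are placeholders.

# Minimal wrapper around the two samtools commands used by the bam_compress_*_umi rules.
# Assumes 'samtools' is on PATH; input/output names are placeholders.
import subprocess

def bam_to_cram(bam: str, reference_fasta: str, cram: str, threads: int = 4) -> None:
    # -C writes CRAM; -T supplies the reference fasta required for CRAM encoding.
    subprocess.run(
        ["samtools", "view", "-h", "-T", reference_fasta,
         "--threads", str(threads), "-C", "-o", cram, bam],
        check=True,
    )
    subprocess.run(["samtools", "index", cram], check=True)

if __name__ == "__main__":
    bam_to_cram("tumor.sample1.consensusfiltered_umi.bam",
                "reference_genome.fasta",
                "tumor.sample1.consensusfiltered_umi.cram")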
"bam_compress_normal_umi_{sample}.tsv").as_posix() + singularity: + Path(singularity_image, config["bioinfo_tools"].get("compress") + ".sif").as_posix() + params: + sample_id = "{sample}", + housekeeper_id= {"id": normal_sample, "tags": "umi_normal"} + threads: + get_threads(cluster_config, "bam_compress") + message: + "Compressing UMI bam to cram for {params.sample_id}" + shell: + """ + samtools view -h -T {input.fasta} --threads {threads} -C -o {output.cram} {input.bam}; + samtools index {output.cram}; + """ diff --git a/BALSAMIC/snakemake_rules/umi/sentieon_varcall_tnscope.rule b/BALSAMIC/snakemake_rules/umi/sentieon_varcall_tnscope.rule index 3c7beb901..73358a446 100644 --- a/BALSAMIC/snakemake_rules/umi/sentieon_varcall_tnscope.rule +++ b/BALSAMIC/snakemake_rules/umi/sentieon_varcall_tnscope.rule @@ -7,21 +7,20 @@ rule sentieon_tnscope_umi: input: - bamT = config_model.get_final_bam_name(bam_dir=umi_dir, sample_name=tumor_sample, specified_suffix="consensusfiltered_umi"), + bam = expand(umi_dir + "tumor.{sample}.consensusfiltered_umi.bam", sample=tumor_sample), ref_fa = config["reference"]["reference_genome"], bed = config["panel"]["capture_kit"], dbsnp = config["reference"]["dbsnp"] output: - vcf_tnscope_umi = vcf_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".tnscope_umi.vcf.gz", + vcf_tnscope_umi = vcf_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".tnscope_umi.preprocess.vcf.gz", namemap = vcf_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".tnscope_umi.sample_name_map" benchmark: Path(benchmark_dir, "sentieon_tnscope_umi_" + config["analysis"]["case_id"] + ".tsv").as_posix() params: - housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "research"}, tmpdir = tempfile.mkdtemp(prefix=tmp_dir), sentieon_exec = config_model.sentieon.sentieon_exec, sentieon_lic = config_model.sentieon.sentieon_license, - tumor_af = params.umicommon.filter_tumor_af, + tumor_af = params.tnscope_umi.filter_tumor_af, algo = params.tnscope_umi.algo, disable_detect = params.tnscope_umi.disable_detect, tumor_lod = params.tnscope_umi.min_tumorLOD, @@ -30,7 +29,7 @@ rule sentieon_tnscope_umi: prune_factor = params.tnscope_umi.prunefactor, padding = params.tnscope_umi.padding, tumor = "TUMOR", - pcr_model = params.common.pcr_model + pcr_model = params.tnscope_umi.pcr_model threads: get_threads(cluster_config, "sentieon_tnscope_umi") message: diff --git a/BALSAMIC/snakemake_rules/umi/sentieon_varcall_tnscope_tn.rule b/BALSAMIC/snakemake_rules/umi/sentieon_varcall_tnscope_tn.rule index df416a69c..1c60a3ba2 100644 --- a/BALSAMIC/snakemake_rules/umi/sentieon_varcall_tnscope_tn.rule +++ b/BALSAMIC/snakemake_rules/umi/sentieon_varcall_tnscope_tn.rule @@ -6,29 +6,28 @@ rule sentieon_tnscope_umi_tn: input: - bamT = config_model.get_final_bam_name(bam_dir=umi_dir, sample_name=tumor_sample, specified_suffix="consensusfiltered_umi"), - bamN = config_model.get_final_bam_name(bam_dir=umi_dir, sample_name=normal_sample, specified_suffix="consensusfiltered_umi"), + bamT = expand(umi_dir + "tumor.{sample}.consensusfiltered_umi.bam", sample=tumor_sample), + bamN = expand(umi_dir + "normal.{sample}.consensusfiltered_umi.bam", sample=normal_sample), ref_fa = config["reference"]["reference_genome"], bed = config["panel"]["capture_kit"], dbsnp = config["reference"]["dbsnp"] output: - vcf_tnscope_umi = vcf_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".tnscope_umi.vcf.gz", + vcf_tnscope_umi = vcf_dir + "SNV.somatic." 
+ config["analysis"]["case_id"] + ".tnscope_umi.preprocess.vcf.gz", namemap = vcf_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".tnscope_umi.sample_name_map" benchmark: Path(benchmark_dir, "sentieon_tnscope_umi_" + config["analysis"]["case_id"] + ".tsv").as_posix() params: - housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "research"}, tmpdir = tempfile.mkdtemp(prefix=tmp_dir), sentieon_exec = config_model.sentieon.sentieon_exec, sentieon_lic = config_model.sentieon.sentieon_license, - tumor_af = params.umicommon.filter_tumor_af, + tumor_af = params.tnscope_umi.filter_tumor_af, algo = params.tnscope_umi.algo, disable_detect = params.tnscope_umi.disable_detect, tumor_lod = params.tnscope_umi.min_tumorLOD, init_tumor_lod = params.tnscope_umi.init_tumorLOD, error_rate = params.tnscope_umi.error_rate, prune_factor = params.tnscope_umi.prunefactor, - pcr_model = params.common.pcr_model, + pcr_model = params.tnscope_umi.pcr_model, padding = params.tnscope_umi.padding, tumor = "TUMOR", normal = "NORMAL", diff --git a/BALSAMIC/snakemake_rules/variant_calling/cnvkit_preprocess.rule b/BALSAMIC/snakemake_rules/variant_calling/cnvkit_preprocess.rule index 38dcc5490..55859b5f8 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/cnvkit_preprocess.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/cnvkit_preprocess.rule @@ -31,6 +31,7 @@ rule create_coverage: Path(benchmark_dir, "cnvkit_{sample}.coverage.tsv").as_posix() params: case_name = config["analysis"]["case_id"], + tmpdir = tempfile.mkdtemp(prefix=tmp_dir), min_mapq = params.common.min_mapq, singularity: Path(singularity_image, "cnvkit.sif").as_posix() @@ -40,6 +41,10 @@ rule create_coverage: "Segmenting genomic regions using CNVkit for {params.case_name}" shell: """ +export TMPDIR={params.tmpdir} ; + cnvkit.py coverage {input.bam} {input.target_bed} -o {output.target_cnn} --min-mapq {params.min_mapq} --processes {threads} ; cnvkit.py coverage {input.bam} {input.antitarget_bed} -o {output.antitarget_cnn} --min-mapq {params.min_mapq} --processes {threads} ; + +rm -rf {params.tmpdir} """ diff --git a/BALSAMIC/snakemake_rules/variant_calling/merge_snv_vcfs.rule b/BALSAMIC/snakemake_rules/variant_calling/merge_snv_vcfs.rule new file mode 100644 index 000000000..2026910aa --- /dev/null +++ b/BALSAMIC/snakemake_rules/variant_calling/merge_snv_vcfs.rule @@ -0,0 +1,52 @@ + +rule bcftools_normalise_vcfs: + input: + ref = config["reference"]["reference_genome"], + vcf = vcf_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".{caller}.research.vcf.gz", + output: + vcf_normalised = vcf_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".{caller}.research.normalised.vcf.gz", + benchmark: + Path(benchmark_dir,'bcftools_norm_{caller}' + config["analysis"]["case_id"] + ".tsv").as_posix() + singularity: + Path(singularity_image,config["bioinfo_tools"].get("bcftools") + ".sif").as_posix() + params: + case_name = config["analysis"]["case_id"], + variant_caller = "{caller}", + threads: + get_threads(cluster_config,'bcftools_normalise_vcfs') + message: + "Normalising variants for {params.variant_caller} {params.case_name}" + shell: + """ + bcftools norm --output-type u --multiallelics -both --check-ref s --fasta-ref {input.ref} {input.vcf} \ + | bcftools norm -o {output.vcf_normalised} --output-type z --rm-dup none - + + tabix -p vcf -f {output.vcf_normalised}; + """ + +rule bcftools_concatenate_vcfs: + input: + vcfs = expand(vcf_dir + "SNV.somatic." 
+ config["analysis"]["case_id"] + ".{caller}.research.normalised.vcf.gz", caller=["vardict", "tnscope"]) + output: + vcf_merged = vcf_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".merged.research.vcf.gz" + benchmark: + Path(benchmark_dir + "bcftools_concatenate_vcfs_" + config["analysis"]["case_id"] + ".tsv").as_posix() + singularity: + Path(singularity_image, config["bioinfo_tools"].get("bcftools") + ".sif").as_posix() + params: + tmpdir = tempfile.mkdtemp(prefix=tmp_dir), + threads: + get_threads(cluster_config, "bcftools_concatenate_vcfs") + message: + "Merging VCFs with bcftools concat" + shell: + """ +export TMPDIR={params.tmpdir}; + +vcf_files=$(echo {input.vcfs} | sed 's/ / -I /g') ; + +bcftools concat -O z --allow-overlaps --rm-dups all {input.vcfs} > {output.vcf_merged} ; +tabix -p vcf -f {output.vcf_merged} ; + +rm -rf {params.tmpdir} + """ \ No newline at end of file diff --git a/BALSAMIC/snakemake_rules/variant_calling/sentieon_quality_filter.rule b/BALSAMIC/snakemake_rules/variant_calling/sentieon_quality_filter.rule deleted file mode 100644 index 5be821298..000000000 --- a/BALSAMIC/snakemake_rules/variant_calling/sentieon_quality_filter.rule +++ /dev/null @@ -1,202 +0,0 @@ - - -if config["analysis"]["sequencing_type"] == 'wgs' and config["analysis"]["analysis_type"] == 'single': - rule bcftools_quality_filter_tnscope_tumor_only: - input: - vcf_snv = vcf_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".tnscope.vcf.gz", - wgs_calling_file = config["reference"]["wgs_calling_regions"], - output: - vcf_snv_research = vcf_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".tnscope.research.vcf.gz", - benchmark: - Path(benchmark_dir, 'bcftools_quality_filter_tnscope_tumor_only_' + config["analysis"]["case_id"] + ".tsv").as_posix() - singularity: - Path(singularity_image, config["bioinfo_tools"].get("bcftools") + ".sif").as_posix() - params: - tmpdir = tempfile.mkdtemp(prefix=tmp_dir), - DP = [SENTIEON_CALLER.DP.tag_value, SENTIEON_CALLER.DP.filter_name], - AD = [SENTIEON_CALLER.AD.tag_value, SENTIEON_CALLER.AD.filter_name], - AF_min = [SENTIEON_CALLER.AF_min.tag_value, SENTIEON_CALLER.AF_min.filter_name], - strand_reads = [SENTIEON_CALLER.strand_reads.tag_value, SENTIEON_CALLER.strand_reads.filter_name], - qss = [SENTIEON_CALLER.qss.tag_value, SENTIEON_CALLER.qss.filter_name], - sor = [SENTIEON_CALLER.sor.tag_value, SENTIEON_CALLER.sor.filter_name], - case_name = config["analysis"]["case_id"], - threads: - get_threads(cluster_config, 'bcftools_quality_filter_tnscope_tumor_only') - message: - "Quality filtering WGS tumor-only tnscope variants using bcftools for {params.case_name}" - shell: - """ -export TMPDIR={params.tmpdir}; - -grep -v '^@' {input.wgs_calling_file} > {input.wgs_calling_file}.bed - -bcftools view -f PASS,triallelic_site --threads {threads} --regions-file {input.wgs_calling_file}.bed {input.vcf_snv} \ -| bcftools filter --threads {threads} --include 'SUM(FORMAT/AD[0:0]+FORMAT/AD[0:1]) >= {params.DP[0]}' --soft-filter '{params.DP[1]}' --mode '+' \ -| bcftools filter --threads {threads} --include 'FORMAT/AD[0:1] > {params.AD[0]}' --soft-filter '{params.AD[1]}' --mode '+' \ -| bcftools filter --threads {threads} --include 'FORMAT/AF > {params.AF_min[0]}' --soft-filter '{params.AF_min[1]}' --mode '+' \ -| bcftools filter --threads {threads} --include 'SUM(FORMAT/QSS)/SUM(FORMAT/AD) >= {params.qss[0]}' --soft-filter '{params.qss[1]}' --mode '+' \ -| bcftools filter --threads {threads} --include 'FORMAT/ALT_F1R2 > {params.strand_reads[0]} && 
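The merge_snv_vcfs.rule introduced above first normalises each caller's research VCF (splitting multiallelics, checking REF against the fasta, removing duplicates) and then concatenates the vardict and tnscope results into the merged VCF consumed by the new filter rules. The following subprocess sketch reproduces that two-step flow with the same bcftools options, assuming bcftools and tabix are on PATH; the paths in the usage block are placeholders.

# Sketch of the normalise-then-concat flow from merge_snv_vcfs.rule, using the same bcftools options.
import subprocess

def normalise(vcf: str, reference_fasta: str, out_vcf: str) -> None:
    # Split multiallelics, fix/check REF against the fasta, then drop exact duplicates.
    norm = subprocess.Popen(
        ["bcftools", "norm", "--output-type", "u", "--multiallelics", "-both",
         "--check-ref", "s", "--fasta-ref", reference_fasta, vcf],
        stdout=subprocess.PIPE,
    )
    subprocess.run(
        ["bcftools", "norm", "-o", out_vcf, "--output-type", "z", "--rm-dup", "none", "-"],
        stdin=norm.stdout, check=True,
    )
    norm.stdout.close()
    if norm.wait() != 0:
        raise RuntimeError("bcftools norm failed")
    subprocess.run(["tabix", "-p", "vcf", "-f", out_vcf], check=True)

def concat(normalised_vcfs: list, merged_vcf: str) -> None:
    # --allow-overlaps interleaves records from the two callers; --rm-dups all keeps one copy of duplicates.
    with open(merged_vcf, "wb") as out:
        subprocess.run(
            ["bcftools", "concat", "-O", "z", "--allow-overlaps", "--rm-dups", "all",
             *normalised_vcfs],
            stdout=out, check=True,
        )
    subprocess.run(["tabix", "-p", "vcf", "-f", merged_vcf], check=True)

if __name__ == "__main__":
    callers = ("vardict", "tnscope")
    for caller in callers:
        normalise(f"SNV.somatic.case.{caller}.research.vcf.gz", "reference.fasta",
                  f"SNV.somatic.case.{caller}.research.normalised.vcf.gz")
    concat([f"SNV.somatic.case.{c}.research.normalised.vcf.gz" for c in callers],
           "SNV.somatic.case.merged.research.vcf.gz")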
(FORMAT/ALT_F1R2 > 0 && FORMAT/ALT_F2R1 > {params.strand_reads[0]} && FORMAT/REF_F1R2 > {params.strand_reads[0]} && FORMAT/REF_F2R1 > {params.strand_reads[0]})' --soft-filter '{params.strand_reads[1]}' --mode '+' \ -| bcftools filter --threads {threads} --include "INFO/SOR < {params.sor[0]}" --soft-filter '{params.sor[1]}' --mode '+' \ -| bcftools view -f PASS,triallelic_site -O z -o {output.vcf_snv_research}; - -tabix -p vcf -f {output.vcf_snv_research}; - -rm -rf {params.tmpdir}; - """ - -elif config["analysis"]["sequencing_type"] == 'wgs' and config["analysis"]["analysis_type"] == 'paired': - rule bcftools_quality_filter_tnscope_tumor_normal: - input: - vcf_snv = vcf_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".tnscope.vcf.gz", - wgs_calling_file = config["reference"]["wgs_calling_regions"], - output: - vcf_snv_research = vcf_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".tnscope.research.vcf.gz", - benchmark: - Path(benchmark_dir, 'bcftools_quality_filter_tnscope_tumor_normal_' + config["analysis"]["case_id"] + ".tsv").as_posix() - singularity: - Path(singularity_image, config["bioinfo_tools"].get("bcftools") + ".sif").as_posix() - params: - AD = [SENTIEON_CALLER.AD.tag_value, SENTIEON_CALLER.AD.filter_name], - DP = [SENTIEON_CALLER.DP.tag_value, SENTIEON_CALLER.DP.filter_name], - AF_min = [SENTIEON_CALLER.AF_min.tag_value, SENTIEON_CALLER.AF_min.filter_name], - high_normal_tumor_af_frac_filter_name=SENTIEON_CALLER.high_normal_tumor_af_frac.filter_name, - high_normal_tumor_af_frac_value=SENTIEON_CALLER.high_normal_tumor_af_frac.tag_value, - case_name = config["analysis"]["case_id"], - threads: - get_threads(cluster_config, 'bcftools_quality_filter_tnscope_tumor_normal') - message: - "Quality filtering WGS tumor-normal tnscope variants using bcftools for {params.case_name}" - shell: - """ -bcftools view {input.vcf_snv} \ -| bcftools filter --threads {threads} --include 'SUM(FORMAT/AD[0:0]+FORMAT/AD[0:1]) >= {params.DP[0]} || SUM(FORMAT/AD[1:0]+FORMAT/AD[1:1]) >= {params.DP[0]}' --soft-filter '{params.DP[1]}' --mode '+' \ -| bcftools filter --threads {threads} --include 'FORMAT/AD[0:1] >= {params.AD[0]}' --soft-filter '{params.AD[1]}' --mode '+' \ -| bcftools filter --threads {threads} --include 'FORMAT/AF[0] >= {params.AF_min[0]}' --soft-filter '{params.AF_min[1]}' --mode '+' \ -| bcftools annotate -x FILTER/alt_allele_in_normal \ -| bcftools filter --threads {threads} --exclude 'sum(FORMAT/AF[1])/sum(FORMAT/AF[0])>{params.high_normal_tumor_af_frac_value}' --soft-filter '{params.high_normal_tumor_af_frac_filter_name}' --mode '+' \ -| bcftools view -f PASS,triallelic_site -O z -o {output.vcf_snv_research}; - -tabix -p vcf -f {output.vcf_snv_research}; - """ - -if config["analysis"]["sequencing_type"] == 'targeted' and config["analysis"]["analysis_type"] == 'single': - - rule bcftools_quality_filter_vardict_tumor_only: - input: - vcf = vcf_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".vardict.vcf.gz", - output: - vcf_filtered = vcf_dir + "SNV.somatic." 
+ config["analysis"]["case_id"] + ".vardict.research.vcf.gz", - benchmark: - Path(benchmark_dir,'bcftools_quality_filter_vardict_tumor_only_' + config["analysis"]["case_id"] + ".tsv").as_posix() - singularity: - Path(singularity_image,config["bioinfo_tools"].get("bcftools") + ".sif").as_posix() - params: - MQ=[VARDICT.MQ.tag_value, VARDICT.MQ.filter_name], - AD=[VARDICT.AD.tag_value, VARDICT.AD.filter_name], - DP=[VARDICT.DP.tag_value, VARDICT.DP.filter_name], - AF_min=[VARDICT.AF_min.tag_value, VARDICT.AF_min.filter_name], - case_name = config["analysis"]["case_id"], - threads: - get_threads(cluster_config,'bcftools_quality_filter_vardict_tumor_only') - message: - "Quality filtering vardict tumor-only annotated variants using bcftools for {params.case_name}" - shell: - """ -bcftools view -f PASS {input.vcf} | \ -bcftools filter --include 'INFO/MQ >= {params.MQ[0]}' --soft-filter '{params.MQ[1]}' --mode '+' | \ -bcftools filter --include 'INFO/DP >= {params.DP[0]}' --soft-filter '{params.DP[1]}' --mode '+' | \ -bcftools filter --include 'INFO/VD >= {params.AD[0]}' --soft-filter '{params.AD[1]}' --mode '+' | \ -bcftools filter --include 'INFO/AF >= {params.AF_min[0]}' --soft-filter '{params.AF_min[1]}' --mode '+' | \ -bcftools view -f PASS -o {output.vcf_filtered} -O z; - -tabix -p vcf -f {output.vcf_filtered}; - """ - - - rule bcftools_quality_filter_TNscope_umi_tumor_only: - input: - vcf= vcf_dir + "SNV.somatic."+ config["analysis"]["case_id"] + ".tnscope_umi.vcf.gz", - output: - vcf_filtered = vcf_dir + "SNV.somatic."+ config["analysis"]["case_id"] + ".tnscope_umi.research.vcf.gz" - benchmark: - Path(benchmark_dir,'bcftools_quality_filter_TNscope_umi_tumor_only' + config["analysis"]["case_id"] + ".tsv").as_posix() - singularity: - Path(singularity_image,config["bioinfo_tools"].get("bcftools") + ".sif").as_posix() - params: - case_name = config["analysis"]["case_id"], - threads: - get_threads(cluster_config,'bcftools_quality_filter_tnscope_umi_tumor_only') - message: - "Quality filtering tnscope_umi tumor-only variants using bcftools for {params.case_name}" - shell: - """ -bcftools view {input.vcf} | \ -bcftools view -f PASS,triallelic_site -o {output.vcf_filtered} -O z; - -tabix -p vcf -f {output.vcf_filtered}; - """ - - -elif config["analysis"]["sequencing_type"] == 'targeted' and config["analysis"]["analysis_type"] == 'paired': - - rule bcftools_quality_filter_vardict_tumor_normal: - input: - vcf = vcf_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".vardict.vcf.gz", - output: - vcf_filtered = vcf_dir + "SNV.somatic." 
+ config["analysis"]["case_id"] + ".vardict.research.vcf.gz", - benchmark: - Path(benchmark_dir,'bcftools_quality_filter_vardict_tumor_normal_' + config["analysis"]["case_id"] + ".tsv").as_posix() - singularity: - Path(singularity_image,config["bioinfo_tools"].get("bcftools") + ".sif").as_posix() - params: - MQ=[VARDICT.MQ.tag_value, VARDICT.MQ.filter_name], - AD=[VARDICT.AD.tag_value, VARDICT.AD.filter_name], - DP=[VARDICT.DP.tag_value, VARDICT.DP.filter_name], - AF_min=[VARDICT.AF_min.tag_value, VARDICT.AF_min.filter_name], - possible_germline="balsamic_possible_germline", - case_name = config["analysis"]["case_id"], - threads: - get_threads(cluster_config,'bcftools_quality_filter_vardict_tumor_normal') - message: - "Quality filtering vardict tumor-normal variants using bcftools for {params.case_name} " - shell: - """ -bcftools view {input.vcf} | \ -bcftools filter --include 'SMPL_MIN(FMT/MQ) >= {params.MQ[0]}' --soft-filter '{params.MQ[1]}' --mode + | \ -bcftools filter --include 'INFO/DP >= {params.DP[0]}' --soft-filter '{params.DP[1]}' --mode '+' | \ -bcftools filter --include 'INFO/VD >= {params.AD[0]}' --soft-filter '{params.AD[1]}' --mode '+' | \ -bcftools filter --include 'INFO/AF >= {params.AF_min[0]}' --soft-filter '{params.AF_min[1]}' --mode '+' | \ -bcftools filter --exclude 'INFO/STATUS ~ "germline/i"' --soft-filter '{params.possible_germline}' --mode '+' | \ -bcftools view -f PASS -o {output.vcf_filtered} -O z; - -tabix -p vcf -f {output.vcf_filtered}; - """ - - - rule bcftools_quality_filter_TNscope_umi_tumor_normal: - input: - vcf = vcf_dir + "SNV.somatic."+ config["analysis"]["case_id"] + ".tnscope_umi.vcf.gz", - output: - vcf_filtered = vcf_dir + "SNV.somatic."+ config["analysis"]["case_id"] + ".tnscope_umi.research.vcf.gz", - benchmark: - Path(benchmark_dir,'bcftools_quality_filter_TNscope_umi_tumor_normal' + config["analysis"]["case_id"] + ".tsv").as_posix() - singularity: - Path(singularity_image,config["bioinfo_tools"].get("bcftools") + ".sif").as_posix() - params: - case_name = config["analysis"]["case_id"], - high_normal_tumor_af_frac_filter_name=SENTIEON_CALLER.high_normal_tumor_af_frac.filter_name, - high_normal_tumor_af_frac_value=SENTIEON_CALLER.high_normal_tumor_af_frac.tag_value, - threads: - get_threads(cluster_config,'bcftools_quality_filter_TNscope_umi_tumor_normal') - message: - "Quality filtering TNscope_umi tumor-normal annotated variants using bcftools for {params.case_name} " - shell: - """ -bcftools view {input.vcf} \ -| bcftools annotate -x FILTER/alt_allele_in_normal \ -| bcftools filter --threads {threads} --exclude 'sum(FORMAT/AF[1])/sum(FORMAT/AF[0])>{params.high_normal_tumor_af_frac_value}' --soft-filter '{params.high_normal_tumor_af_frac_filter_name}' --mode '+' \ -| bcftools view -f PASS,triallelic_site -o {output.vcf_filtered} -O z; - -tabix -p vcf -f {output.vcf_filtered}; - """ - diff --git a/BALSAMIC/snakemake_rules/variant_calling/sentieon_split_snv_sv.rule b/BALSAMIC/snakemake_rules/variant_calling/sentieon_split_snv_sv.rule deleted file mode 100644 index 882e5ff6a..000000000 --- a/BALSAMIC/snakemake_rules/variant_calling/sentieon_split_snv_sv.rule +++ /dev/null @@ -1,35 +0,0 @@ -# vim: syntax=python tabstop=4 expandtab -# coding: utf-8 - - - -rule bcftools_view_split_variant: - input: - ref = config["reference"]["reference_genome"], - vcf = vcf_dir + "sentieon_tnscope/ALL.somatic." + config["analysis"]["case_id"] + ".tnscope.vcf.gz", - output: - vcf_tnscope = vcf_dir + "SNV.somatic." 
+ config["analysis"]["case_id"] + ".tnscope.vcf.gz", - vcf_tnscope_sv = vcf_dir + "SV.somatic." + config["analysis"]["case_id"] + ".tnscope.research.vcf.gz", - benchmark: - Path(benchmark_dir,'bcftools_view_split_variant_' + config[ "analysis" ][ "case_id" ] + ".tsv").as_posix() - singularity: - Path(singularity_image, config["bioinfo_tools"].get("bcftools") + ".sif").as_posix() - params: - housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "research"}, - tmpdir = tempfile.mkdtemp(prefix=tmp_dir), - case_name = config["analysis"]["case_id"] - threads: - get_threads(cluster_config, 'bcftools_view_split_variant') - message: - "Split tnscope snv and sv variants using bcftools for {params.case_name}" - shell: - """ -export TMPDIR={params.tmpdir}; -mkdir -p {params.tmpdir}; - -bcftools view --include 'INFO/SVTYPE=="."' -O z -o {output.vcf_tnscope} {input.vcf}; -tabix -p vcf -f {output.vcf_tnscope}; - -bcftools view --include 'INFO/SVTYPE!="."' -O z -o {output.vcf_tnscope_sv} {input.vcf}; -tabix -p vcf -f {output.vcf_tnscope_sv}; - """ diff --git a/BALSAMIC/snakemake_rules/variant_calling/sentieon_t_varcall.rule b/BALSAMIC/snakemake_rules/variant_calling/sentieon_t_varcall_wgs.rule similarity index 76% rename from BALSAMIC/snakemake_rules/variant_calling/sentieon_t_varcall.rule rename to BALSAMIC/snakemake_rules/variant_calling/sentieon_t_varcall_wgs.rule index d2f0f5a50..f46a16c9b 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/sentieon_t_varcall.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/sentieon_t_varcall_wgs.rule @@ -1,15 +1,8 @@ # vim: syntax=python tabstop=4 expandtab # coding: utf-8 -def get_pon(config): - """ return pon cli string, complete with file """ - if "PON" in config["analysis"]: - return os.path.abspth(config["analysis"]["PON"]) - else: - return None - -rule sentieon_TNscope_tumor_only: +rule sentieon_tnscope_wgs_tumor_only: input: ref = config["reference"]["reference_genome"], dbsnp = config["reference"]["dbsnp"], @@ -26,11 +19,10 @@ rule sentieon_TNscope_tumor_only: tmpdir = tempfile.mkdtemp(prefix=tmp_dir), tumor = "TUMOR", tumor_options = VARCALL_PARAMS["tnscope"]["tumor"], - pon = " " if get_pon(config) is None else " ".join(["--pon", get_pon(config)]), pcr_model = params.common.pcr_model, - sentieon_exec = config_model.sentieon.sentieon_exec, - sentieon_lic = config_model.sentieon.sentieon_license, - case_name = config["analysis"]["case_id"] + sentieon_exec=config_model.sentieon.sentieon_exec, + sentieon_lic=config_model.sentieon.sentieon_license, + case_name = config["analysis"]["case_id"] threads: get_threads(cluster_config, 'sentieon_TNscope_tumor_only') message: @@ -48,7 +40,7 @@ export SENTIEON_LICENSE={params.sentieon_lic}; -q {input.recal} \ -t {threads} \ --algo TNscope \ ---tumor_sample {params.tumor} {params.pon} \ +--tumor_sample {params.tumor} \ --dbsnp {input.dbsnp} \ --cosmic {input.cosmic} \ --pcr_indel_mode {params.pcr_model} \ diff --git a/BALSAMIC/snakemake_rules/variant_calling/sentieon_tn_varcall.rule b/BALSAMIC/snakemake_rules/variant_calling/sentieon_tn_varcall_wgs.rule similarity index 77% rename from BALSAMIC/snakemake_rules/variant_calling/sentieon_tn_varcall.rule rename to BALSAMIC/snakemake_rules/variant_calling/sentieon_tn_varcall_wgs.rule index ec249a8bf..e8e1a2d46 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/sentieon_tn_varcall.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/sentieon_tn_varcall_wgs.rule @@ -2,7 +2,7 @@ # coding: utf-8 -rule sentieon_TNscope: +rule sentieon_tnscope_wgs_tumor_normal: 
input: ref = config["reference"]["reference_genome"], dbsnp = config["reference"]["dbsnp"], @@ -24,23 +24,19 @@ rule sentieon_TNscope: pcr_model = params.common.pcr_model, tumor_options = VARCALL_PARAMS["tnscope"]["tumor"], normal_options = VARCALL_PARAMS["tnscope"]["normal"], - sentieon_ml_tnscope = config_model.sentieon.tnscope_model, - sentieon_exec = config_model.sentieon.sentieon_exec, - sentieon_lic = config_model.sentieon.sentieon_license, + sentieon_exec=config_model.sentieon.sentieon_exec, + sentieon_lic=config_model.sentieon.sentieon_license, case_name = config["analysis"]["case_id"] threads: get_threads(cluster_config, 'sentieon_TNscope') message: - ("Calling SNVs and SVs using Sentieon TNscope and " - "applying machine learning algorithm for sample {params.case_name}") + ("Calling SNVs and SVs using Sentieon TNscope for sample {params.case_name}") shell: """ export TMPDIR={params.tmpdir}; export SENTIEON_TMPDIR={params.tmpdir}; export SENTIEON_LICENSE={params.sentieon_lic}; -intermediate_vcf={params.tmpdir}/tn_sentieon_varcall_file - {params.sentieon_exec} driver \ -t {threads} \ -r {input.ref} \ @@ -55,13 +51,7 @@ intermediate_vcf={params.tmpdir}/tn_sentieon_varcall_file --cosmic {input.cosmic} \ --pcr_indel_mode {params.pcr_model} \ {params.tumor_options} \ -{params.normal_options} $intermediate_vcf; - -{params.sentieon_exec} driver \ --r {input.ref} \ ---algo TNModelApply \ --m {params.sentieon_ml_tnscope} \ --v $intermediate_vcf {output.vcf_tnscope}; +{params.normal_options} {output.vcf_tnscope}; echo -e \"{params.tumor}\\tTUMOR\\n{params.normal}\\tNORMAL\" > {output.namemap_snv}; cp {output.namemap_snv} {output.namemap_sv} diff --git a/BALSAMIC/snakemake_rules/variant_calling/snv_quality_filter.rule b/BALSAMIC/snakemake_rules/variant_calling/snv_quality_filter.rule new file mode 100644 index 000000000..f06b909b3 --- /dev/null +++ b/BALSAMIC/snakemake_rules/variant_calling/snv_quality_filter.rule @@ -0,0 +1,270 @@ + + +if config["analysis"]["sequencing_type"] == 'wgs' and config["analysis"]["analysis_type"] == 'single': + rule bcftools_quality_filter_tnscope_tumor_only_wgs: + input: + vcf_snv = vcf_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".tnscope.vcf.gz", + wgs_calling_file = config["reference"]["wgs_calling_regions"], + output: + vcf_snv_research = vcf_dir + "SNV.somatic." 
+ config["analysis"]["case_id"] + ".tnscope.research.vcf.gz", + benchmark: + Path(benchmark_dir, 'bcftools_quality_filter_tnscope_tumor_only_' + config["analysis"]["case_id"] + ".tsv").as_posix() + singularity: + Path(singularity_image, config["bioinfo_tools"].get("bcftools") + ".sif").as_posix() + params: + tmpdir = tempfile.mkdtemp(prefix=tmp_dir), + DP = [SNV_FILTER_SETTINGS.DP.tag_value, SNV_FILTER_SETTINGS.DP.filter_name], + AD = [SNV_FILTER_SETTINGS.AD.tag_value, SNV_FILTER_SETTINGS.AD.filter_name], + AF_min = [SNV_FILTER_SETTINGS.AF_min.tag_value, SNV_FILTER_SETTINGS.AF_min.filter_name], + strand_reads = [SNV_FILTER_SETTINGS.strand_reads.tag_value, SNV_FILTER_SETTINGS.strand_reads.filter_name], + qss = [SNV_FILTER_SETTINGS.qss.tag_value, SNV_FILTER_SETTINGS.qss.filter_name], + sor = [SNV_FILTER_SETTINGS.sor.tag_value, SNV_FILTER_SETTINGS.sor.filter_name], + case_name = config["analysis"]["case_id"], + threads: + get_threads(cluster_config, 'bcftools_quality_filter_tnscope_tumor_only') + message: + "Quality filtering WGS tumor-only tnscope variants using bcftools for {params.case_name}" + shell: + """ +export TMPDIR={params.tmpdir}; + +grep -v '^@' {input.wgs_calling_file} > {input.wgs_calling_file}.bed + +bcftools view --threads {threads} --regions-file {input.wgs_calling_file}.bed {input.vcf_snv} \ +| bcftools filter --threads {threads} --include 'SUM(FORMAT/AD[0:0]+FORMAT/AD[0:1]) >= {params.DP[0]}' --soft-filter '{params.DP[1]}' --mode '+' \ +| bcftools filter --threads {threads} --include 'FORMAT/AD[0:1] > {params.AD[0]}' --soft-filter '{params.AD[1]}' --mode '+' \ +| bcftools filter --threads {threads} --include 'FORMAT/AF > {params.AF_min[0]}' --soft-filter '{params.AF_min[1]}' --mode '+' \ +| bcftools filter --threads {threads} --include 'SUM(FORMAT/QSS)/SUM(FORMAT/AD) >= {params.qss[0]}' --soft-filter '{params.qss[1]}' --mode '+' \ +| bcftools filter --threads {threads} --include 'FORMAT/ALT_F1R2 > {params.strand_reads[0]} && (FORMAT/ALT_F1R2 > 0 && FORMAT/ALT_F2R1 > {params.strand_reads[0]} && FORMAT/REF_F1R2 > {params.strand_reads[0]} && FORMAT/REF_F2R1 > {params.strand_reads[0]})' --soft-filter '{params.strand_reads[1]}' --mode '+' \ +| bcftools filter --threads {threads} --include "INFO/SOR < {params.sor[0]}" --soft-filter '{params.sor[1]}' --mode '+' \ +| bcftools view -i 'FILTER == "PASS" || FILTER == "triallelic_site"' -O z -o {output.vcf_snv_research}; + +tabix -p vcf -f {output.vcf_snv_research}; + +rm -rf {params.tmpdir}; + """ + +elif config["analysis"]["sequencing_type"] == 'wgs' and config["analysis"]["analysis_type"] == 'paired': + rule bcftools_quality_filter_tnscope_tumor_normal_wgs: + input: + vcf_snv = vcf_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".tnscope.vcf.gz", + wgs_calling_file = config["reference"]["wgs_calling_regions"], + output: + vcf_snv_research = vcf_dir + "SNV.somatic." 
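The tumor-only WGS quality filter above soft-filters on summed allelic depth, alt-allele support, allele frequency, mean base quality (QSS over AD) and SOR; the strand-read filter is left out of the sketch for brevity. Below is a pure-Python restatement of those comparisons with placeholder thresholds and filter names; the configured values come from SNV_FILTER_SETTINGS, which is not shown in this diff.

# Pure-Python restatement of the comparisons in the tumor-only WGS quality filter.
# Thresholds and filter names are placeholders, not the pipeline's configured settings.
def failing_quality_filters(ad_ref: int, ad_alt: int, af: float,
                            qss_ref: int, qss_alt: int, sor: float,
                            min_dp: int = 10, min_ad: int = 3, min_af: float = 0.05,
                            min_qss: float = 20.0, max_sor: float = 3.0) -> list:
    failed = []
    if ad_ref + ad_alt < min_dp:               # SUM(FORMAT/AD[0:0]+AD[0:1]) >= DP threshold
        failed.append("low_depth")
    if ad_alt <= min_ad:                       # FORMAT/AD[0:1] > AD threshold
        failed.append("low_alt_support")
    if af <= min_af:                           # FORMAT/AF > AF_min threshold
        failed.append("low_af")
    total_ad = ad_ref + ad_alt
    if total_ad == 0 or (qss_ref + qss_alt) / total_ad < min_qss:   # SUM(QSS)/SUM(AD) >= qss
        failed.append("low_base_quality")
    if sor >= max_sor:                         # INFO/SOR < sor threshold
        failed.append("high_strand_bias")
    return failed                              # empty list means the record stays PASS

print(failing_quality_filters(ad_ref=60, ad_alt=12, af=0.17, qss_ref=1900, qss_alt=380, sor=1.2))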
+ config["analysis"]["case_id"] + ".tnscope.research.vcf.gz", + benchmark: + Path(benchmark_dir, 'bcftools_quality_filter_tnscope_tumor_normal_' + config["analysis"]["case_id"] + ".tsv").as_posix() + singularity: + Path(singularity_image, config["bioinfo_tools"].get("bcftools") + ".sif").as_posix() + params: + AD = [SNV_FILTER_SETTINGS.AD.tag_value, SNV_FILTER_SETTINGS.AD.filter_name], + DP = [SNV_FILTER_SETTINGS.DP.tag_value, SNV_FILTER_SETTINGS.DP.filter_name], + AF_min = [SNV_FILTER_SETTINGS.AF_min.tag_value, SNV_FILTER_SETTINGS.AF_min.filter_name], + high_normal_tumor_af_frac_filter_name=SNV_FILTER_SETTINGS.high_normal_tumor_af_frac.filter_name, + high_normal_tumor_af_frac_value=SNV_FILTER_SETTINGS.high_normal_tumor_af_frac.tag_value, + case_name = config["analysis"]["case_id"], + threads: + get_threads(cluster_config, 'bcftools_quality_filter_tnscope_tumor_normal') + message: + "Quality filtering WGS tumor-normal tnscope variants using bcftools for {params.case_name}" + shell: + """ +bcftools view {input.vcf_snv} \ +| bcftools filter --threads {threads} --include 'SUM(FORMAT/AD[0:0]+FORMAT/AD[0:1]) >= {params.DP[0]} || SUM(FORMAT/AD[1:0]+FORMAT/AD[1:1]) >= {params.DP[0]}' --soft-filter '{params.DP[1]}' --mode '+' \ +| bcftools filter --threads {threads} --include 'FORMAT/AD[0:1] >= {params.AD[0]}' --soft-filter '{params.AD[1]}' --mode '+' \ +| bcftools filter --threads {threads} --include 'FORMAT/AF[0] >= {params.AF_min[0]}' --soft-filter '{params.AF_min[1]}' --mode '+' \ +| bcftools annotate -x FILTER/alt_allele_in_normal \ +| bcftools filter --threads {threads} --exclude 'sum(FORMAT/AF[1])/sum(FORMAT/AF[0])>{params.high_normal_tumor_af_frac_value}' --soft-filter '{params.high_normal_tumor_af_frac_filter_name}' --mode '+' \ +| bcftools view -i 'FILTER == "PASS" || FILTER == "triallelic_site"' -O z -o {output.vcf_snv_research}; + +tabix -p vcf -f {output.vcf_snv_research}; + """ + +if config["analysis"]["sequencing_type"] == 'targeted' and config["analysis"]["analysis_type"] == 'single': + + rule bcftools_quality_filter_tnscope_tumor_only_tga: + input: + vcf = vcf_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".tnscope.vcf.gz", + output: + vcf_filtered = vcf_dir + "SNV.somatic." 
+ config["analysis"]["case_id"] + ".tnscope.research.vcf.gz", + benchmark: + Path(benchmark_dir,'bcftools_quality_filter_vardict_tumor_only_' + config["analysis"]["case_id"] + ".tsv").as_posix() + singularity: + Path(singularity_image,config["bioinfo_tools"].get("bcftools") + ".sif").as_posix() + params: + AD=[SNV_FILTER_SETTINGS.AD.tag_value, SNV_FILTER_SETTINGS.AD.filter_name], + DP=[SNV_FILTER_SETTINGS.DP.tag_value, SNV_FILTER_SETTINGS.DP.filter_name], + AF_min=[SNV_FILTER_SETTINGS.AF_min.tag_value, SNV_FILTER_SETTINGS.AF_min.filter_name], + strand_reads=[SNV_FILTER_SETTINGS.strand_reads.tag_value, SNV_FILTER_SETTINGS.strand_reads.filter_name], + qss=[SNV_FILTER_SETTINGS.qss.tag_value, SNV_FILTER_SETTINGS.qss.filter_name], + sor=[SNV_FILTER_SETTINGS.sor.tag_value, SNV_FILTER_SETTINGS.sor.filter_name], + case_name = config["analysis"]["case_id"], + threads: + get_threads(cluster_config,'bcftools_quality_filter_vardict_tumor_only') + message: + "Quality filtering vardict tumor-only annotated variants using bcftools for {params.case_name}" + shell: + """ + bcftools view {input.vcf} \ + | bcftools filter --include 'SUM(FORMAT/AD[0:0]+FORMAT/AD[0:1]) >= {params.DP[0]}' --soft-filter '{params.DP[1]}' --mode '+' \ + | bcftools filter --include 'FORMAT/AD[0:1] >= {params.AD[0]}' --soft-filter '{params.AD[1]}' --mode '+' \ + | bcftools filter --threads {threads} --include 'SUM(FORMAT/QSS)/SUM(FORMAT/AD) >= {params.qss[0]}' --soft-filter '{params.qss[1]}' --mode '+' \ + | bcftools filter --threads {threads} --include "INFO/SOR < {params.sor[0]}" --soft-filter '{params.sor[1]}' --mode '+' \ + | bcftools filter --include 'FORMAT/AF[0] >= {params.AF_min[0]}' --soft-filter '{params.AF_min[1]}' --mode '+' \ + | bcftools view -i 'FILTER == "PASS" || FILTER == "triallelic_site"' -o {output.vcf_filtered} -O z; + + tabix -p vcf -f {output.vcf_filtered}; + """ + + rule bcftools_quality_filter_vardict_tumor_only: + input: + vcf = vcf_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".vardict.vcf.gz", + output: + vcf_filtered=vcf_dir + "SNV.somatic." 
+ config["analysis"]["case_id"] + ".vardict.research.vcf.gz", + benchmark: + Path(benchmark_dir,'bcftools_quality_filter_vardict_tumor_only_' + config["analysis"]["case_id"] + ".tsv").as_posix() + singularity: + Path(singularity_image,config["bioinfo_tools"].get("bcftools") + ".sif").as_posix() + params: + MQ=[SNV_FILTER_SETTINGS.MQ.tag_value, SNV_FILTER_SETTINGS.MQ.filter_name], + AD=[SNV_FILTER_SETTINGS.AD.tag_value, SNV_FILTER_SETTINGS.AD.filter_name], + DP=[SNV_FILTER_SETTINGS.DP.tag_value, SNV_FILTER_SETTINGS.DP.filter_name], + AF_min=[SNV_FILTER_SETTINGS.AF_min.tag_value, SNV_FILTER_SETTINGS.AF_min.filter_name], + case_name=config["analysis"]["case_id"], + threads: + get_threads(cluster_config,'bcftools_quality_filter_vardict_tumor_only') + message: + "Quality filtering vardict tumor-only annotated variants using bcftools for {params.case_name}" + shell: + """ + bcftools view -f PASS {input.vcf} | \ + bcftools filter --include 'INFO/MQ >= {params.MQ[0]}' --soft-filter '{params.MQ[1]}' --mode '+' | \ + bcftools filter --include 'INFO/DP >= {params.DP[0]}' --soft-filter '{params.DP[1]}' --mode '+' | \ + bcftools filter --include 'INFO/VD >= {params.AD[0]}' --soft-filter '{params.AD[1]}' --mode '+' | \ + bcftools filter --include 'INFO/AF >= {params.AF_min[0]}' --soft-filter '{params.AF_min[1]}' --mode '+' | \ + bcftools view -f PASS -o {output.vcf_filtered} -O z; + + tabix -p vcf -f {output.vcf_filtered}; + """ + + rule bcftools_quality_filter_TNscope_umi_tumor_only: + input: + vcf= vcf_dir + "SNV.somatic."+ config["analysis"]["case_id"] + ".tnscope_umi.vcf.gz", + output: + vcf_filtered = vcf_dir + "SNV.somatic."+ config["analysis"]["case_id"] + ".tnscope_umi.research.vcf.gz" + benchmark: + Path(benchmark_dir,'bcftools_quality_filter_TNscope_umi_tumor_only' + config["analysis"]["case_id"] + ".tsv").as_posix() + singularity: + Path(singularity_image,config["bioinfo_tools"].get("bcftools") + ".sif").as_posix() + params: + case_name = config["analysis"]["case_id"], + threads: + get_threads(cluster_config,'bcftools_quality_filter_tnscope_umi_tumor_only') + message: + "Quality filtering tnscope_umi tumor-only variants using bcftools for {params.case_name}" + shell: + """ + bcftools view {input.vcf} \ + | bcftools view -i 'FILTER == "PASS" || FILTER == "triallelic_site"' -o {output.vcf_filtered} -O z; + + tabix -p vcf -f {output.vcf_filtered}; + """ + + +elif config["analysis"]["sequencing_type"] == 'targeted' and config["analysis"]["analysis_type"] == 'paired': + + rule bcftools_quality_filter_tnscope_tumor_normal_tga: + input: + vcf = vcf_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".tnscope.vcf.gz", + output: + vcf_filtered = vcf_dir + "SNV.somatic." 
+ config["analysis"]["case_id"] + ".tnscope.research.vcf.gz", + benchmark: + Path(benchmark_dir,'bcftools_quality_filter_vardict_tumor_normal_' + config["analysis"]["case_id"] + ".tsv").as_posix() + singularity: + Path(singularity_image,config["bioinfo_tools"].get("bcftools") + ".sif").as_posix() + params: + AD=[SNV_FILTER_SETTINGS.AD.tag_value, SNV_FILTER_SETTINGS.AD.filter_name], + DP=[SNV_FILTER_SETTINGS.DP.tag_value, SNV_FILTER_SETTINGS.DP.filter_name], + AF_min=[SNV_FILTER_SETTINGS.AF_min.tag_value, SNV_FILTER_SETTINGS.AF_min.filter_name], + high_normal_tumor_af_frac_filter_name=SNV_FILTER_SETTINGS.high_normal_tumor_af_frac.filter_name, + high_normal_tumor_af_frac_value=SNV_FILTER_SETTINGS.high_normal_tumor_af_frac.tag_value, + case_name = config["analysis"]["case_id"], + threads: + get_threads(cluster_config,'bcftools_quality_filter_vardict_tumor_normal') + message: + "Quality filtering vardict tumor-normal variants using bcftools for {params.case_name} " + shell: + """ + bcftools view {input.vcf} \ + | bcftools annotate -x FILTER/alt_allele_in_normal \ + | bcftools filter --threads {threads} --exclude 'sum(FORMAT/AF[1])/sum(FORMAT/AF[0])>{params.high_normal_tumor_af_frac_value}' --soft-filter '{params.high_normal_tumor_af_frac_filter_name}' --mode '+' \ + | bcftools filter --include 'SUM(FORMAT/AD[0:0]+FORMAT/AD[0:1]) >= {params.DP[0]} || SUM(FORMAT/AD[1:0]+FORMAT/AD[1:1]) >= {params.DP[0]}' --soft-filter '{params.DP[1]}' --mode '+' \ + | bcftools filter --include 'FORMAT/AD[0:1] >= {params.AD[0]}' --soft-filter '{params.AD[1]}' --mode '+' \ + | bcftools filter --include 'FORMAT/AF[0] >= {params.AF_min[0]}' --soft-filter '{params.AF_min[1]}' --mode '+' \ + | bcftools view -i 'FILTER == "PASS" || FILTER == "triallelic_site"' -o {output.vcf_filtered} -O z; + + tabix -p vcf -f {output.vcf_filtered}; + """ + + rule bcftools_quality_filter_vardict_tumor_normal: + input: + vcf = vcf_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".vardict.vcf.gz", + output: + vcf_filtered = vcf_dir + "SNV.somatic." 
+ config["analysis"]["case_id"] + ".vardict.research.vcf.gz", + benchmark: + Path(benchmark_dir,'bcftools_quality_filter_vardict_tumor_normal_' + config["analysis"]["case_id"] + ".tsv").as_posix() + singularity: + Path(singularity_image,config["bioinfo_tools"].get("bcftools") + ".sif").as_posix() + params: + MQ=[SNV_FILTER_SETTINGS.MQ.tag_value, SNV_FILTER_SETTINGS.MQ.filter_name], + AD=[SNV_FILTER_SETTINGS.AD.tag_value, SNV_FILTER_SETTINGS.AD.filter_name], + DP=[SNV_FILTER_SETTINGS.DP.tag_value, SNV_FILTER_SETTINGS.DP.filter_name], + AF_min=[SNV_FILTER_SETTINGS.AF_min.tag_value, SNV_FILTER_SETTINGS.AF_min.filter_name], + high_normal_tumor_af_frac_filter_name=SNV_FILTER_SETTINGS.high_normal_tumor_af_frac.filter_name, + high_normal_tumor_af_frac_value=SNV_FILTER_SETTINGS.high_normal_tumor_af_frac.tag_value, + possible_germline="balsamic_possible_germline", + case_name=config["analysis"]["case_id"], + threads: + get_threads(cluster_config,'bcftools_quality_filter_vardict_tumor_normal') + message: + "Quality filtering vardict tumor-normal variants using bcftools for {params.case_name} " + shell: + """ + bcftools view {input.vcf} | \ + bcftools filter --include 'SMPL_MIN(FMT/MQ) >= {params.MQ[0]}' --soft-filter '{params.MQ[1]}' --mode + | \ + bcftools filter --threads {threads} --exclude 'sum(FORMAT/AF[1:0])/sum(FORMAT/AF[0:0])>{params.high_normal_tumor_af_frac_value}' --soft-filter '{params.high_normal_tumor_af_frac_filter_name}' --mode '+' | \ + bcftools filter --include 'INFO/DP >= {params.DP[0]}' --soft-filter '{params.DP[1]}' --mode '+' | \ + bcftools filter --include 'INFO/VD >= {params.AD[0]}' --soft-filter '{params.AD[1]}' --mode '+' | \ + bcftools filter --include 'INFO/AF >= {params.AF_min[0]}' --soft-filter '{params.AF_min[1]}' --mode '+' | \ + bcftools filter --exclude 'INFO/STATUS ~ "germline/i"' --soft-filter '{params.possible_germline}' --mode '+' | \ + bcftools view -f PASS -o {output.vcf_filtered} -O z; + + tabix -p vcf -f {output.vcf_filtered}; + """ + + rule bcftools_quality_filter_TNscope_umi_tumor_normal: + input: + vcf = vcf_dir + "SNV.somatic."+ config["analysis"]["case_id"] + ".tnscope_umi.vcf.gz", + output: + vcf_filtered = vcf_dir + "SNV.somatic."+ config["analysis"]["case_id"] + ".tnscope_umi.research.vcf.gz", + benchmark: + Path(benchmark_dir,'bcftools_quality_filter_TNscope_umi_tumor_normal' + config["analysis"]["case_id"] + ".tsv").as_posix() + singularity: + Path(singularity_image,config["bioinfo_tools"].get("bcftools") + ".sif").as_posix() + params: + case_name = config["analysis"]["case_id"], + high_normal_tumor_af_frac_filter_name=SNV_FILTER_SETTINGS.high_normal_tumor_af_frac.filter_name, + high_normal_tumor_af_frac_value=SNV_FILTER_SETTINGS.high_normal_tumor_af_frac.tag_value, + threads: + get_threads(cluster_config,'bcftools_quality_filter_TNscope_umi_tumor_normal') + message: + "Quality filtering TNscope_umi tumor-normal annotated variants using bcftools for {params.case_name} " + shell: + """ + bcftools view {input.vcf} \ + | bcftools annotate -x FILTER/alt_allele_in_normal \ + | bcftools filter --threads {threads} --exclude 'sum(FORMAT/AF[1])/sum(FORMAT/AF[0])>{params.high_normal_tumor_af_frac_value}' --soft-filter '{params.high_normal_tumor_af_frac_filter_name}' --mode '+' \ + | bcftools view -i 'FILTER == "PASS" || FILTER == "triallelic_site"' -o {output.vcf_filtered} -O z; + + tabix -p vcf -f {output.vcf_filtered}; + """ + diff --git a/BALSAMIC/snakemake_rules/variant_calling/snv_t_varcall_tga.rule 
b/BALSAMIC/snakemake_rules/variant_calling/snv_t_varcall_tga.rule new file mode 100644 index 000000000..33f1c4d13 --- /dev/null +++ b/BALSAMIC/snakemake_rules/variant_calling/snv_t_varcall_tga.rule @@ -0,0 +1,155 @@ +# vim: syntax=python tabstop=4 expandtab +# coding: utf-8 + +# Variant-calling using TNscope + + + +rule sentieon_tnscope_tga_tumor_only: + input: + bam = config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = tumor_sample), + ref_fa = config["reference"]["reference_genome"], + bed = config["panel"]["capture_kit"], + dbsnp = config["reference"]["dbsnp"] + output: + vcf_tnscope = vcf_dir + "sentieon_tnscope" + "/" + "ALL.somatic." + config["analysis"]["case_id"] + ".tnscope.vcf.gz", + namemap_snv = vcf_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".tnscope.sample_name_map", + namemap_sv = vcf_dir + "SV.somatic." + config["analysis"]["case_id"] + ".tnscope.sample_name_map", + benchmark: + Path(benchmark_dir, "sentieon_tnscope_" + config["analysis"]["case_id"] + ".tsv").as_posix() + params: + algo = params.tnscope_tga.algo, + error_rate = params.tnscope_tga.error_rate, + init_tumor_lod = params.tnscope_tga.init_tumorLOD, + padding = params.tnscope_tga.padding, + pcr_model = params.tnscope_tga.pcr_model, + prune_factor = params.tnscope_tga.prunefactor, + sentieon_exec=config_model.sentieon.sentieon_exec, + sentieon_lic=config_model.sentieon.sentieon_license, + tmpdir = tempfile.mkdtemp(prefix=tmp_dir), + tumor = "TUMOR", + tumor_af = params.tnscope_tga.filter_tumor_af, + tumor_lod = params.tnscope_tga.min_tumorLOD, + threads: + get_threads(cluster_config, "sentieon_tnscope_tga_t_only") + message: + "Calling single nucleotide variants using TNscope for {params.tumor}" + shell: + """ +mkdir -p {params.tmpdir}; +export TMPDIR={params.tmpdir}; +export SENTIEON_TMPDIR={params.tmpdir}; +export SENTIEON_LICENSE={params.sentieon_lic}; + +{params.sentieon_exec} driver \ +-t {threads} \ +-r {input.ref_fa} \ +-i {input.bam} \ +--interval {input.bed} \ +--interval_padding {params.padding} \ +--algo {params.algo} \ +--tumor_sample {params.tumor} \ +--dbsnp {input.dbsnp} \ +--trim_soft_clip \ +--min_tumor_allele_frac {params.tumor_af} \ +--filter_t_alt_frac {params.tumor_af} \ +--min_init_tumor_lod {params.init_tumor_lod} \ +--min_tumor_lod {params.tumor_lod} \ +--max_error_per_read {params.error_rate} \ +--pcr_indel_model {params.pcr_model} \ +--prune_factor {params.prune_factor} \ +{output.vcf_tnscope}; + +echo -e \"{params.tumor}\\tTUMOR\" > {output.namemap_snv}; +cp {output.namemap_snv} {output.namemap_sv}; +rm -rf {params.tmpdir}; + """ + + +rule vardict_tumor_only: + input: + fa = config["reference"]["reference_genome"], + bamT = config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = tumor_sample), + bed = vcf_dir + "split_bed/{bedchrom}." 
+ capture_kit, + output: + temp(vcf_dir + "vardict/split_vcf/{bedchrom}_vardict.vcf.gz") + benchmark: + Path(benchmark_dir, 'vardict_tumor_only_' + '{bedchrom}.tsv').as_posix() + singularity: + Path(singularity_image, config["bioinfo_tools"].get("vardict") + ".sif").as_posix() + params: + tmpdir = tempfile.mkdtemp(prefix=tmp_dir), + af = params.vardict.allelic_frequency, + max_pval = params.vardict.max_pval, + max_mm = params.vardict.max_mm, + col_info = params.vardict.column_info, + case_name = config["analysis"]["case_id"], + threads: + get_threads(cluster_config, "vardict_tumor_only") + message: + "Calling single nucleotide variants using vardict for {params.case_name}" + shell: + """ +export PERL5LIB=; + +export TMPDIR={params.tmpdir}; +export VAR_DICT_OPTS='\"-Djava.io.tmpdir={params.tmpdir}\" \"-Xmx45G\"'; + +vardict-java -I 600 \ +-G {input.fa} \ +-f {params.af} \ +-N {params.case_name} \ +-th {threads} \ +-b {input.bamT} \ +{params.col_info} {input.bed} \ +| teststrandbias.R \ +| var2vcf_valid.pl -P {params.max_pval} \ +-m {params.max_mm} -E -f {params.af} -N {params.case_name} \ +| bgzip > {output}; + +tabix -p vcf {output}; +rm -rf {params.tmpdir}; + """ + +rule post_process_vardict: + input: + vcf_sorted = vcf_dir + "vardict/merged_sorted.vardict.vcf" + output: + vcf_vardict = vcf_dir + "vardict/SNV.somatic." + config["analysis"]["case_id"] + ".vardict.vcf.gz", + namemap=vcf_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".vardict.sample_name_map" + params: + tmpdir=tempfile.mkdtemp(prefix=tmp_dir), + case_name=config["analysis"]["case_id"], + edit_vcf_script= get_script_path("edit_vcf_info.py"), + variant_caller= "vardict" + benchmark: + Path(benchmark_dir,'vardict_merge_' + config["analysis"]["case_id"] + ".tsv").as_posix() + singularity: + Path(singularity_image,config["bioinfo_tools"].get("bcftools") + ".sif").as_posix() + threads: + get_threads(cluster_config,"post_process_vardict") + message: + ("Bgzip, index and reheader merged VarDict vcf and add FOUND_IN for case: {params.case_name}") + shell: + """ +mkdir -p {params.tmpdir}; +export TMPDIR={params.tmpdir}; + +python {params.edit_vcf_script} \ +--input_vcf {input.vcf_sorted} \ +--output_vcf {params.tmpdir}/vardict_merged_sorted.temp.vcf \ +--variant_caller {params.variant_caller}; + +bgzip {params.tmpdir}/vardict_merged_sorted.temp.vcf ; +tabix -f -p vcf {params.tmpdir}/vardict_merged_sorted.temp.vcf.gz ; + +echo \"TUMOR\" > {params.tmpdir}/reheader ; + +bcftools reheader -s {params.tmpdir}/reheader {params.tmpdir}/vardict_merged_sorted.temp.vcf.gz -o {output.vcf_vardict} ; +tabix -f -p vcf {output.vcf_vardict} ; + +echo -e \"tTUMOR\\tTUMOR\" > {output.namemap} ; +echo -e \"tTUMOR\" > {output.namemap}.tumor ; + +rm -rf {params.tmpdir} ; + """ \ No newline at end of file diff --git a/BALSAMIC/snakemake_rules/variant_calling/snv_tn_varcall_tga.rule b/BALSAMIC/snakemake_rules/variant_calling/snv_tn_varcall_tga.rule new file mode 100644 index 000000000..581f6bf21 --- /dev/null +++ b/BALSAMIC/snakemake_rules/variant_calling/snv_tn_varcall_tga.rule @@ -0,0 +1,148 @@ +# vim: syntax=python tabstop=4 expandtab +# coding: utf-8 + +# Variant-calling using TNscope and VarDict + + +rule sentieon_tnscope_tga_tumor_normal: + input: + bamN = config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = normal_sample), + bamT = config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = tumor_sample), + ref_fa = config["reference"]["reference_genome"], + bed = config["panel"]["capture_kit"], + dbsnp = 
config["reference"]["dbsnp"] + output: + vcf_tnscope = vcf_dir + "sentieon_tnscope" + "/" + "ALL.somatic." + config["analysis"]["case_id"] + ".tnscope.vcf.gz", + namemap_snv = vcf_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".tnscope.sample_name_map", + namemap_sv = vcf_dir + "SV.somatic." + config["analysis"]["case_id"] + ".tnscope.sample_name_map", + benchmark: + Path(benchmark_dir, "sentieon_tnscope_umi_" + config["analysis"]["case_id"] + ".tsv").as_posix() + params: + algo = params.tnscope_tga.algo, + case_name= config["analysis"]["case_id"], + error_rate = params.tnscope_tga.error_rate, + init_tumor_lod = params.tnscope_tga.init_tumorLOD, + normal = "NORMAL", + padding = params.tnscope_tga.padding, + pcr_model = params.tnscope_tga.pcr_model, + prune_factor = params.tnscope_tga.prunefactor, + sentieon_exec=config_model.sentieon.sentieon_exec, + sentieon_lic=config_model.sentieon.sentieon_license, + tmpdir = tempfile.mkdtemp(prefix=tmp_dir), + tumor = "TUMOR", + tumor_af = params.tnscope_tga.filter_tumor_af, + tumor_lod = params.tnscope_tga.min_tumorLOD, + threads: + get_threads(cluster_config, "sentieon_tnscope_tga_tumor_normal") + message: + "Calling single nucleotide variants using TNscope for {params.case_name}" + shell: + """ +mkdir -p {params.tmpdir}; +export TMPDIR={params.tmpdir}; +export SENTIEON_TMPDIR={params.tmpdir}; +export SENTIEON_LICENSE={params.sentieon_lic}; + +{params.sentieon_exec} driver \ +-t {threads} \ +-r {input.ref_fa} \ +-i {input.bamT} \ +-i {input.bamN} \ +--interval {input.bed} \ +--interval_padding {params.padding} \ +--algo {params.algo} \ +--tumor_sample {params.tumor} \ +--normal_sample {params.normal} \ +--dbsnp {input.dbsnp} \ +--trim_soft_clip \ +--min_tumor_allele_frac {params.tumor_af} \ +--filter_t_alt_frac {params.tumor_af} \ +--min_init_tumor_lod {params.init_tumor_lod} \ +--min_tumor_lod {params.tumor_lod} \ +--max_error_per_read {params.error_rate} \ +--pcr_indel_model {params.pcr_model} \ +--prune_factor {params.prune_factor} \ +{output.vcf_tnscope}; + +echo -e \"{params.tumor}\\tTUMOR\\n{params.normal}\\tNORMAL\" > {output.namemap_snv}; +cp {output.namemap_snv} {output.namemap_sv} + """ + +rule vardict_tumor_normal: + input: + fa = config["reference"]["reference_genome"], + bamN = config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = normal_sample), + bamT = config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = tumor_sample), + bed = vcf_dir + "split_bed/{bedchrom}." 
+ capture_kit, + output: + temp(vcf_dir + "vardict/split_vcf/{bedchrom}_vardict.vcf.gz") + benchmark: + Path(benchmark_dir,'vardict_tumor_normal_' + "{bedchrom}.tsv").as_posix() + singularity: + Path(singularity_image, config["bioinfo_tools"].get("vardict") + ".sif").as_posix() + params: + tmpdir = tempfile.mkdtemp(prefix=tmp_dir), + af = params.vardict.allelic_frequency, + max_pval = params.vardict.max_pval, + max_mm = params.vardict.max_mm, + col_info = params.vardict.column_info, + case_name = config["analysis"]["case_id"], + threads: + get_threads(cluster_config, "vardict_tumor_normal") + message: + "Calling variants using vardict for {params.case_name}" + shell: + """ +export TMPDIR={params.tmpdir}; +export VAR_DICT_OPTS='\"-Djava.io.tmpdir={params.tmpdir}\" \"-Xmx90G\"'; + +vardict-java -I 600 -G {input.fa} -f {params.af} -N {params.case_name} \ +-b \"{input.bamT}|{input.bamN}\" \ +-th {threads} \ +{params.col_info} {input.bed} \ +| testsomatic.R \ +| var2vcf_paired.pl -P {params.max_pval} \ +-m {params.max_mm} -M -f {params.af} -N {params.case_name} \ +| bgzip > {output}; + +tabix -p vcf {output}; +rm -rf {params.tmpdir}; + """ + +rule post_process_vardict: + input: + vcf_sorted = vcf_dir + "vardict/merged_sorted.vardict.vcf" + output: + vcf_vardict = vcf_dir + "vardict/SNV.somatic." + config["analysis"]["case_id"] + ".vardict.vcf.gz", + namemap = vcf_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".vardict.sample_name_map" + params: + tmpdir=tempfile.mkdtemp(prefix=tmp_dir), + case_name=config["analysis"]["case_id"], + benchmark: + Path(benchmark_dir,'post_process_vardict_' + config["analysis"]["case_id"] + ".tsv").as_posix() + singularity: + Path(singularity_image,config["bioinfo_tools"].get("bcftools") + ".sif").as_posix() + threads: + get_threads(cluster_config,"post_process_vardict") + message: + ("Bgzip, index and reheader merged VarDict vcf for case: {params.case_name}") + shell: + """ +mkdir -p {params.tmpdir}; +export TMPDIR={params.tmpdir}; + +bgzip {input.vcf_sorted} ; +tabix -f -p vcf {input.vcf_sorted}.gz ; + +echo \"TUMOR\" > {params.tmpdir}/reheader ; +echo \"NORMAL\" >> {params.tmpdir}/reheader ; + +bcftools reheader -s {params.tmpdir}/reheader {input.vcf_sorted}.gz -o {output.vcf_vardict} ; +tabix -f -p vcf {output.vcf_vardict} ; + +echo -e \"tTUMOR\\tTUMOR\\nNORMAL\\tNORMAL\" > {output.namemap} ; +echo -e \"tTUMOR\" > {output.namemap}.tumor ; +echo -e \"nNORMAL\" > {output.namemap}.normal ; + +rm -rf {params.tmpdir} ; + """ diff --git a/BALSAMIC/snakemake_rules/variant_calling/somatic_cnv_tumor_normal_tga.rule b/BALSAMIC/snakemake_rules/variant_calling/somatic_cnv_tumor_normal_tga.rule index 622d28024..81b1535ed 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/somatic_cnv_tumor_normal_tga.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/somatic_cnv_tumor_normal_tga.rule @@ -61,6 +61,7 @@ rule cnvkit_segment_CNV_research: get_threads(cluster_config, "cnvkit_segment_CNV_research") params: housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "cnv"}, + tmpdir = tempfile.mkdtemp(prefix=tmp_dir), cnv_dir = cnv_dir, case_name = config["analysis"]["case_id"], sample_id = "TUMOR", @@ -71,6 +72,8 @@ rule cnvkit_segment_CNV_research: "Segmenting genomic regions using CNVkit for {params.case_name}" shell: """ +export TMPDIR={params.tmpdir} ; + # Combine the uncorrected target and antitarget coverage tables (.cnn) and # correct for biases in regional coverage and GC content, according to the given normal or PON reference if [[ ! 
-f "{params.pon}" ]]; then @@ -111,6 +114,8 @@ cnvkit.py segment {output.cnr} \ # Convert copy number segments (initial.cns) to standard SEG format to be used in PureCN cnvkit.py export seg {output.cns_initial} \ --output {output.segment}; + +rm -rf {params.tmpdir}; """ rule purecn_call_CNV_research: @@ -144,8 +149,8 @@ rule purecn_call_CNV_research: "Computing tumor purity and ploidy using PureCN for {params.case_name}" shell: """ -export PURECN='/opt/PureCN/PureCN.R' - +export TMPDIR={params.tmpdir} ; +export PURECN='/opt/PureCN/PureCN.R' ; # Run PureCN to estimate tumor-purity and ploidy @@ -222,6 +227,7 @@ rule cnvkit_call_CNV_research: get_threads(cluster_config, "cnvkit_call_CNV_research") params: housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "cnv"}, + tmpdir = tempfile.mkdtemp(prefix=tmp_dir), cnv_dir = cnv_dir, cnsr = lambda wc: "tumor.merged.cn{s,r}", case_name = config["analysis"]["case_id"], @@ -232,6 +238,8 @@ rule cnvkit_call_CNV_research: "Computing tumor purity and ploidy using PureCN for {params.case_name}" shell: """ +export TMPDIR={params.tmpdir} ; + purity=$(awk -F\\, 'NR>1 {{print $2}}' {input.purity_ploidy}) ploidy=$(awk -F\\, 'NR>1 {{printf int($3)}}' {input.purity_ploidy}) @@ -274,6 +282,8 @@ cnvkit.py export vcf {output.cns} \ -o {output.vcf} \ --sample-id {params.tumor_sample_id} \ --sample-sex {params.gender}; + +rm -rf {params.tmpdir}; """ rule delly_cnv_tumor_normal: @@ -303,6 +313,8 @@ rule delly_cnv_tumor_normal: ("Calling copy number variants using delly for {params.case_name}") shell: """ +export TMPDIR={params.tmpdir} ; + echo -e \"{params.tumor}\\ttumor\\n{params.normal}\\tcontrol\" > {params.tmpdir}/samples.tsv; delly cnv -u -z 10000 -i 10000 -m {input.map} -g {input.fa} -b {input.baits_bed} -c {output.rd_delly} \ @@ -328,7 +340,7 @@ rule bcftools_sort_cnvkitCNV_research: input: vcf = cnv_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".cnvkit.vcf", output: - namemap = temp(vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".cnvkit.sample_name_map"), + namemap = vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".cnvkit.sample_name_map", vcf = vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".cnvkit.vcf.gz" benchmark: Path(f"{benchmark_dir}/bcftools_sort_cnvkitCNV_research_{config['analysis']['case_id']}.tsv").as_posix() diff --git a/BALSAMIC/snakemake_rules/variant_calling/somatic_cnv_tumor_only_tga.rule b/BALSAMIC/snakemake_rules/variant_calling/somatic_cnv_tumor_only_tga.rule index 155e630cf..5f91e2815 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/somatic_cnv_tumor_only_tga.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/somatic_cnv_tumor_only_tga.rule @@ -19,6 +19,7 @@ rule cnvkit_segment_CNV_research: threads: get_threads(cluster_config,"cnvkit_segment_CNV_research") params: + tmpdir = tempfile.mkdtemp(prefix=tmp_dir), housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "cnv"}, case_name = config["analysis"]["case_id"], sample_id = "TUMOR", @@ -28,6 +29,8 @@ rule cnvkit_segment_CNV_research: ("Segmenting genomic regions using CNVkit for {params.case_name}") shell: """ +export TMPDIR={params.tmpdir} ; + # Combine the uncorrected target and antitarget coverage tables (.cnn) and # correct for biases in regional coverage and GC content, according to the given reference if [[ ! 
-f "{params.pon}" ]]; then @@ -63,6 +66,8 @@ cnvkit.py segment {output.cnr} \ # Convert copy number segments (initial.cns) to standard SEG format to be used for PureCN cnvkit.py export seg {output.cns_initial} --output {output.segment}; + +rm -rf {params.tmpdir}; """ rule purecn_call_CNV_research: @@ -95,7 +100,8 @@ rule purecn_call_CNV_research: ("Computing tumor purity and ploidy values using PureCN for {params.case_name}") shell: """ -export PURECN='/opt/PureCN/PureCN.R' +export TMPDIR={params.tmpdir} ; +export PURECN='/opt/PureCN/PureCN.R' ; # Run PureCN to estimate tumor-purity and ploidy @@ -172,6 +178,7 @@ rule cnvkit_call_CNV_research: threads: get_threads(cluster_config, "cnvkit_call_CNV_research") params: + tmpdir = tempfile.mkdtemp(prefix=tmp_dir), housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "cnv"}, cnv_dir = cnv_dir, cnsr = lambda wc: "tumor.merged.cn{s,r}", @@ -183,6 +190,7 @@ rule cnvkit_call_CNV_research: ("Calling CNVs using CNVkit for {params.case_name}") shell: """ +export TMPDIR={params.tmpdir} ; purity=$(awk -F\\, 'NR>1 {{print $2}}' {input.purity_ploidy}) ploidy=$(awk -F\\, 'NR>1 {{printf int($3)}}' {input.purity_ploidy}) @@ -222,6 +230,8 @@ cnvkit.py export vcf {output.cns} \ --output {output.vcf} \ --sample-sex {params.gender} \ --sample-id {params.sample_id}; + +rm -rf {params.tmpdir}; """ rule delly_cnv_tumor_only: @@ -250,6 +260,8 @@ rule delly_cnv_tumor_only: ("Calling copy number variants using delly for {params.case_name}") shell: """ +export TMPDIR={params.tmpdir} ; + delly cnv -i 10000 -m {input.map} -g {input.fa} -b {input.baits_bed} \ -c {output.rd_delly} -o {output.cnv_delly} -l {input.bcf} {input.bamT} @@ -263,7 +275,7 @@ rule bcftools_sort_cnvkitCNV_research: input: vcf = cnv_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".cnvkit.vcf", output: - namemap = temp(vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".cnvkit.sample_name_map"), + namemap = vcf_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".cnvkit.sample_name_map", vcf = vcf_dir + "CNV.somatic." 
+ config["analysis"]["case_id"] + ".cnvkit.vcf.gz" benchmark: Path(f"{benchmark_dir}/bcftools_sort_cnvkitCNV_research_{config['analysis']['case_id']}.tsv").as_posix() diff --git a/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_normal_tga.rule b/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_normal_tga.rule index 97277d019..a31545edd 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_normal_tga.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_normal_tga.rule @@ -31,7 +31,8 @@ rule manta_tumor_normal: "index the compressed vcf file") shell: """ -samtools_path=$(readlink -f $(which samtools)) +export TMPDIR={params.tmpdir} ; +samtools_path=$(readlink -f $(which samtools)) ; configManta.py \ {params.settings} \ @@ -83,6 +84,8 @@ rule delly_sv_tumor_normal: "filter somatic variants and finally convert from bcf to compressed vcf file") shell: """ +export TMPDIR={params.tmpdir} ; + delly call -x {input.excl} -o {params.tmpdir}/delly.bcf -g {input.fa} {input.bamT} {input.bamN}; echo -e \"{params.tumor}\\ttumor\\n{params.normal}\\tcontrol\" > {params.tmpdir}/samples.tsv; diff --git a/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_normal_wgs.rule b/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_normal_wgs.rule index 57611e836..0fb9b3679 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_normal_wgs.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_normal_wgs.rule @@ -31,7 +31,8 @@ rule manta_tumor_normal: "index the compressed vcf file") shell: """ -samtools_path=$(readlink -f $(which samtools)) +export TMPDIR={params.tmpdir} ; +samtools_path=$(readlink -f $(which samtools)) ; configManta.py \ {params.settings} \ @@ -58,8 +59,6 @@ echo -e \"{params.normal}\\tNORMAL\\n{params.tumor}\\tTUMOR\" > {output.namemap} rm -rf {params.tmpdir}; """ - - rule delly_sv_tumor_normal: input: fa = config["reference"]["reference_genome"], @@ -85,6 +84,8 @@ rule delly_sv_tumor_normal: "filter somatic variants and finally convert from bcf to compressed vcf file") shell: """ +export TMPDIR={params.tmpdir} ; + delly call -x {input.excl} -o {params.tmpdir}/delly.bcf -g {input.fa} {input.bamT} {input.bamN}; echo -e \"{params.tumor}\\ttumor\\n{params.normal}\\tcontrol\" > {params.tmpdir}/samples.tsv; @@ -123,6 +124,8 @@ rule delly_cnv_tumor_normal: ("Calling copy number variants using delly for {params.case_name}") shell: """ +export TMPDIR={params.tmpdir} ; + echo -e \"{params.tumor}\\ttumor\\n{params.normal}\\tcontrol\" > {params.tmpdir}/samples.tsv; delly cnv -u -z 10000 -m {input.map} -g {input.fa} -c {output.rd_delly} \ @@ -179,6 +182,7 @@ rule ascat_tumor_normal: ("Calling copy number variants using ascatNGS for {params.case_name}") shell: """ +export TMPDIR={params.tmpdir} ; export LD_LIBRARY_PATH=:/opt/wtsi-cgp/lib; if [[ "{params.gender}" = "female" ]]; then gender="XX"; else gender="XY"; fi @@ -249,6 +253,8 @@ rule tiddit_sv_tumor_normal: ("Calling structural variants using tiddit for {params.case_name}") shell: """ +export TMPDIR={params.tmpdir} ; + tiddit --cov -z 500 --ref {input.fa} --bam {input.bamT} -o {params.tmpdir}/{params.tumor}_cov & tiddit --cov -z 500 --ref {input.fa} --bam {input.bamN} -o {params.tmpdir}/{params.normal}_cov & diff --git a/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_only_tga.rule b/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_only_tga.rule index 3902ea24c..350483a3f 100644 --- 
a/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_only_tga.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_only_tga.rule @@ -28,7 +28,8 @@ rule manta_tumor_only: "index the compressed vcf file") shell: """ -samtools_path=$(readlink -f $(which samtools)) +export TMPDIR={params.tmpdir} ; +samtools_path=$(readlink -f $(which samtools)) ; configManta.py \ {params.settings} \ @@ -76,6 +77,8 @@ rule delly_sv_tumor_only: ("Calling structural variants using delly for {params.case_name}") shell: """ +export TMPDIR={params.tmpdir} ; + delly call -x {input.excl} -o {output.bcf} -g {input.fa} {input.bamT} echo -e \"{params.tumor}\\tTUMOR\" > {output.namemap}; diff --git a/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_only_wgs.rule b/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_only_wgs.rule index 7661e8eae..5dc40ad50 100644 --- a/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_only_wgs.rule +++ b/BALSAMIC/snakemake_rules/variant_calling/somatic_sv_tumor_only_wgs.rule @@ -28,7 +28,8 @@ rule manta_tumor_only: "index the compressed vcf file") shell: """ -samtools_path=$(readlink -f $(which samtools)) +export TMPDIR={params.tmpdir} ; +samtools_path=$(readlink -f $(which samtools)) ; configManta.py \ {params.settings} \ @@ -77,6 +78,8 @@ rule delly_sv_tumor_only: ("Calling structural variants using delly for {params.case_name}") shell: """ +export TMPDIR={params.tmpdir} ; + delly call -x {input.excl} -o {output.bcf} -g {input.fa} {input.bamT} echo -e \"{params.tumor}\\tTUMOR\" > {output.namemap}; @@ -110,6 +113,8 @@ rule delly_cnv_tumor_only: ("Calling copy number variants using delly for {params.case_name}") shell: """ +export TMPDIR={params.tmpdir} ; + delly cnv -m {input.map} -g {input.fa} -c {output.rd_delly} -o {output.cnv_delly} -l {input.bcf} {input.bamT} echo -e \"{params.tumor}\\tTUMOR\" > {output.namemap}; @@ -140,6 +145,8 @@ rule tiddit_sv_tumor_only: ("Calling structural variants using tiddit for {params.case_name}") shell: """ +export TMPDIR={params.tmpdir} ; + tiddit --cov -z 500 --ref {input.fa} --bam {input.bamT} -o {params.tmpdir}/{params.tumor}_cov & tiddit --sv -p 6 -r 6 -z 1000 --ref {input.fa} --bam {input.bamT} -o {params.tmpdir}/{params.tumor}; @@ -179,6 +186,7 @@ rule cnvpytor_tumor_only: ("Calling copy number variants using cnvpytor for {params.case_name}") shell: """ +export TMPDIR={params.tmpdir} ; export tumor={params.tumor}; export tumor_file={params.tmpdir}/$tumor diff --git a/BALSAMIC/snakemake_rules/variant_calling/somatic_tumor_normal.rule b/BALSAMIC/snakemake_rules/variant_calling/somatic_tumor_normal.rule deleted file mode 100644 index 5e51255cd..000000000 --- a/BALSAMIC/snakemake_rules/variant_calling/somatic_tumor_normal.rule +++ /dev/null @@ -1,79 +0,0 @@ -# vim: syntax=python tabstop=4 expandtab -# coding: utf-8 - - - -rule vardict_tumor_normal: - input: - fa = config["reference"]["reference_genome"], - bamN = config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = normal_sample), - bamT = config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = tumor_sample), - bed = vcf_dir + "split_bed/{bedchrom}." 
+ capture_kit, - output: - temp(vcf_dir + "vardict/split_vcf/{bedchrom}_vardict.vcf.gz") - benchmark: - Path(benchmark_dir,'vardict_tumor_normal_' + "{bedchrom}.tsv").as_posix() - singularity: - Path(singularity_image, config["bioinfo_tools"].get("vardict") + ".sif").as_posix() - params: - tmpdir = tempfile.mkdtemp(prefix=tmp_dir), - af = params.vardict.allelic_frequency, - max_pval = params.vardict.max_pval, - max_mm = params.vardict.max_mm, - col_info = params.vardict.column_info, - case_name = config["analysis"]["case_id"], - threads: - get_threads(cluster_config, "vardict_tumor_normal") - message: - "Calling variants using vardict for {params.case_name}" - shell: - """ -export TMPDIR={params.tmpdir}; -export VAR_DICT_OPTS='\"-Djava.io.tmpdir={params.tmpdir}\" \"-Xmx90G\"'; - -vardict-java -I 600 -G {input.fa} -f {params.af} -N {params.case_name} \ --b \"{input.bamT}|{input.bamN}\" \ --th {threads} \ -{params.col_info} {input.bed} \ -| testsomatic.R \ -| var2vcf_paired.pl -P {params.max_pval} \ --m {params.max_mm} -M -f {params.af} -N {params.case_name} \ -| bgzip > {output}; - -tabix -p vcf {output}; -rm -rf {params.tmpdir}; - """ - - -rule vardict_merge: - input: - expand(vcf_dir + "vardict/split_vcf/{chrom}_vardict.vcf.gz", chrom=chromlist) - output: - vcf_vardict = vcf_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".vardict.vcf.gz", - yaml = vcf_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".vardict.yaml", - namemap = vcf_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".vardict.sample_name_map" - params: - housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "research"}, - tmpdir = tempfile.mkdtemp(prefix=tmp_dir), - case_name = config["analysis"]["case_id"], - benchmark: - Path(benchmark_dir,'vardict_merge_' + config["analysis"]["case_id"] + ".tsv").as_posix() - singularity: - Path(singularity_image, config["bioinfo_tools"].get("bcftools") + ".sif").as_posix() - threads: - get_threads(cluster_config,"vardict_merge") - message: - ("Merging multiple VCFs from vardict into single VCF using bcftools for {params.case_name}") - shell: - """ -bcftools concat {input} | bcftools sort --temp-dir {params.tmpdir} - | bgzip > {output.vcf_vardict}; -tabix -f -p vcf {output.vcf_vardict}; - -echo -e \"{params.case_name}\\tTUMOR\\n{params.case_name}-match\\tNORMAL\" > {output.namemap}; -echo -e \"{params.case_name}\" > {output.namemap}.tumor; -echo -e \"{params.case_name}-match\" > {output.namemap}.normal; -echo '{{ vcf: {{ vardict: {{ name: vardict, path: {output.vcf_vardict} }} }} }}' > {output.yaml}; - -rm -rf {params.tmpdir}; - """ - diff --git a/BALSAMIC/snakemake_rules/variant_calling/somatic_tumor_only.rule b/BALSAMIC/snakemake_rules/variant_calling/somatic_tumor_only.rule deleted file mode 100644 index 2dc077289..000000000 --- a/BALSAMIC/snakemake_rules/variant_calling/somatic_tumor_only.rule +++ /dev/null @@ -1,126 +0,0 @@ -# vim: syntax=python tabstop=4 expandtab -# coding: utf-8 - -rule vardict_tumor_only: - input: - fa = config["reference"]["reference_genome"], - bamT = config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = tumor_sample), - bed = vcf_dir + "split_bed/{bedchrom}." 
+ capture_kit, - output: - temp(vcf_dir + "vardict/split_vcf/{bedchrom}_vardict.vcf.gz") - benchmark: - Path(benchmark_dir, 'vardict_tumor_only_' + '{bedchrom}.tsv').as_posix() - singularity: - Path(singularity_image, config["bioinfo_tools"].get("vardict") + ".sif").as_posix() - params: - tmpdir = tempfile.mkdtemp(prefix=tmp_dir), - af = params.vardict.allelic_frequency, - max_pval = params.vardict.max_pval, - max_mm = params.vardict.max_mm, - col_info = params.vardict.column_info, - case_name = config["analysis"]["case_id"], - threads: - get_threads(cluster_config, "vardict_tumor_only") - message: - "Calling single nucleotide variants using vardict for {params.case_name}" - shell: - """ -export PERL5LIB=; - -export TMPDIR={params.tmpdir}; -export VAR_DICT_OPTS='\"-Djava.io.tmpdir={params.tmpdir}\" \"-Xmx45G\"'; - -vardict-java -I 600 \ --G {input.fa} \ --f {params.af} \ --N {params.case_name} \ --th {threads} \ --b {input.bamT} \ -{params.col_info} {input.bed} \ -| teststrandbias.R \ -| var2vcf_valid.pl -P {params.max_pval} \ --m {params.max_mm} -E -f {params.af} -N {params.case_name} \ -| bgzip > {output}; - -tabix -p vcf {output}; -rm -rf {params.tmpdir}; - """ - - -rule vardict_merge: - input: - expand(vcf_dir + "vardict/split_vcf/{chrom}_vardict.vcf.gz", chrom=chromlist) - output: - namemap = vcf_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".vardict.sample_name_map", - yaml = vcf_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".vardict.yaml", - vcf_vardict = vcf_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".vardict.vcf.gz" - benchmark: - Path(benchmark_dir, 'vardict_merge_' + config["analysis"]["case_id"] + ".tsv").as_posix() - singularity: - Path(singularity_image, config["bioinfo_tools"].get("bcftools") + ".sif").as_posix() - params: - housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "research"}, - tmpdir = tempfile.mkdtemp(prefix=tmp_dir), - case_name = config["analysis"]["case_id"], - threads: - get_threads(cluster_config,"vardict_merge") - message: - ("Merging multiple VCFs from vardict into single VCF using bcftools for {params.case_name}") - shell: - """ -export TMPDIR={params.tmpdir}; - -bcftools concat {input} \ -| bcftools sort --temp-dir {params.tmpdir} - \ -| bgzip > {output.vcf_vardict}; -tabix -f -p vcf {output.vcf_vardict}; - -echo -e \"{params.case_name}\\tTUMOR\" > {output.namemap}; -echo -e \"{params.case_name}\" > {output.namemap}.tumor; -echo '{{ vcf: {{ vardict: {{ name: vardict , path: {output.vcf_vardict} }} }} }}' > {output.yaml}; - -rm -rf {params.tmpdir}; - """ - - -rule sentieon_TNhaplotyper_tumor_only: - input: - bam = config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = tumor_sample), - ref = config["reference"]["reference_genome"], - dbsnp = config["reference"]["dbsnp"], - cosmic = config["reference"]["cosmic"], - interval = config["panel"]["capture_kit"], - output: - vcf = vcf_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".tnhaplotyper.research.vcf.gz", - namemap = vcf_dir + "SNV.somatic." 
+ config["analysis"]["case_id"] + ".tnhaplotyper.sample_name_map", - benchmark: - Path(benchmark_dir,'sentieon_TNhaplotyper_tumor_only_' + config["analysis"]["case_id"] + ".tsv").as_posix() - params: - tumor = "TUMOR", - tmpdir= tempfile.mkdtemp(prefix=tmp_dir), - sentieon_exec = config_model.sentieon.sentieon_exec, - sentieon_lic = config_model.sentieon.sentieon_license, - case_name = config["analysis"]["case_id"] - threads: - get_threads(cluster_config, 'sentieon_TNhaplotyper_tumor_only') - message: - "Calling variants using TNhaplotyper for sample {params.case_name}" - shell: - """ -export TMPDIR={params.tmpdir}; -export SENTIEON_TMPDIR={params.tmpdir}; -export SENTIEON_LICENSE={params.sentieon_lic}; - -{params.sentieon_exec} driver \ --r {input.ref} \ --t {threads} \ --i {input.bam} \ ---interval {input.interval} \ ---algo TNhaplotyper \ ---tumor_sample {params.tumor} \ ---cosmic {input.cosmic} \ ---dbsnp {input.dbsnp} {output.vcf}; - -echo -e \"{params.tumor}\\tTUMOR\" > {output.namemap}; -rm -rf {params.tmpdir}; - """ diff --git a/BALSAMIC/snakemake_rules/variant_calling/split_bed.rule b/BALSAMIC/snakemake_rules/variant_calling/split_bed.rule deleted file mode 100644 index 7c19c0b80..000000000 --- a/BALSAMIC/snakemake_rules/variant_calling/split_bed.rule +++ /dev/null @@ -1,39 +0,0 @@ -# vim: syntax=python tabstop=4 expandtab -# coding: utf-8 - - - -rule bedtools_splitbed_by_chrom: - input: - bed = config["panel"]["capture_kit"], - chrom = config["reference"]["genome_chrom_size"], - bam = config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = tumor_sample) - output: - bed = expand(vcf_dir + "split_bed/" + "{chrom}." + capture_kit, chrom=chromlist) - benchmark: - Path(benchmark_dir, 'bedtools_splitbed_by_chrom.tsv').as_posix() - singularity: - Path(singularity_image, config["bioinfo_tools"].get("bedtools") + ".sif").as_posix() - params: - tmpdir = tempfile.mkdtemp(prefix=tmp_dir), - split_bed_dir = vcf_dir + "split_bed/", - origin_bed = capture_kit, - message: - ("Splitting the panel bed per chromosome, flanking regions by 100bp and merging into single VCF using bedtools") - shell: - """ -mkdir -p {params.tmpdir}; -export TMPDIR={params.tmpdir}; - -chromlist=`cut -f 1 {input.bed} | sort -u`; -sed 's/^chr//g;/_/d' {input.chrom} | sort -k1,1 > {params.split_bed_dir}hg19.chrom.sizes; - -for c in $chromlist; do \ -awk -v C=$c '$1==C' {input.bed} \ -| bedtools slop -b 100 -i - -g {params.split_bed_dir}hg19.chrom.sizes \ -| sort -k1,1 -k2,2n \ -| bedtools merge > {params.split_bed_dir}$c.{params.origin_bed}; done; - -unset chromlist; -readlink -f {input.bam}; - """ diff --git a/BALSAMIC/snakemake_rules/variant_calling/tnscope_post_process.rule b/BALSAMIC/snakemake_rules/variant_calling/tnscope_post_process.rule new file mode 100644 index 000000000..965edfb39 --- /dev/null +++ b/BALSAMIC/snakemake_rules/variant_calling/tnscope_post_process.rule @@ -0,0 +1,65 @@ +# vim: syntax=python tabstop=4 expandtab +# coding: utf-8 + + + +rule bcftools_split_tnscope_variants: + input: + ref = config["reference"]["reference_genome"], + vcf = vcf_dir + "sentieon_tnscope/ALL.somatic." + config["analysis"]["case_id"] + ".tnscope.vcf.gz", + output: + vcf_tnscope = vcf_dir + "sentieon_tnscope/SNV.somatic." + config["analysis"]["case_id"] + ".tnscope.preprocess.vcf", + vcf_tnscope_sv = vcf_dir + "SV.somatic." 
+ config["analysis"]["case_id"] + ".tnscope.research.vcf.gz", + benchmark: + Path(benchmark_dir,'bcftools_split_tnscope_variants_' + config[ "analysis" ][ "case_id" ] + ".tsv").as_posix() + singularity: + Path(singularity_image, config["bioinfo_tools"].get("bcftools") + ".sif").as_posix() + params: + tmpdir = tempfile.mkdtemp(prefix=tmp_dir), + case_name = config["analysis"]["case_id"] + threads: + get_threads(cluster_config, 'bcftools_split_tnscope_variants') + message: + "Split tnscope snv and sv variants using bcftools for {params.case_name}" + shell: + """ +export TMPDIR={params.tmpdir}; +mkdir -p {params.tmpdir}; + +bcftools view --include 'INFO/SVTYPE=="."' -o {output.vcf_tnscope} {input.vcf} ; +bcftools view --include 'INFO/SVTYPE!="."' -O z -o {output.vcf_tnscope_sv} {input.vcf}; +tabix -p vcf -f {output.vcf_tnscope_sv}; + """ + +rule modify_tnscope_infofield: + input: + vcf_tnscope = vcf_dir + "sentieon_tnscope/SNV.somatic." + config["analysis"]["case_id"] + ".tnscope.preprocess.vcf", + output: + vcf_tnscope = vcf_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".tnscope.vcf.gz", + benchmark: + Path(benchmark_dir,'modify_tnscope_infofield_' + config[ "analysis" ][ "case_id" ] + ".tsv").as_posix() + singularity: + Path(singularity_image, config["bioinfo_tools"].get("bcftools") + ".sif").as_posix() + params: + housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "research"}, + modify_tnscope_infofield = get_script_path("modify_tnscope_infofield.py"), + tmpdir = tempfile.mkdtemp(prefix=tmp_dir), + case_name = config["analysis"]["case_id"], + edit_vcf_script = get_script_path("edit_vcf_info.py"), + variant_caller= "tnscope" + threads: + get_threads(cluster_config, 'modify_tnscope_infofield') + message: + "Add DP and AF tumor sample info and FOUND_IN to INFO field for case: {params.case_name}" + shell: + """ +export TMPDIR={params.tmpdir}; +mkdir -p {params.tmpdir}; + +python {params.modify_tnscope_infofield} {input.vcf_tnscope} {params.tmpdir}/vcf_tnscope_snvs_modified.vcf ; +python {params.modify_tnscope_infofield} {params.tmpdir}/vcf_tnscope_snvs_modified.vcf {params.tmpdir}/vcf_tnscope_snvs_modified_found_in_added.vcf ; +bgzip {params.tmpdir}/vcf_tnscope_snvs_modified_found_in_added.vcf ; +mv {params.tmpdir}/vcf_tnscope_snvs_modified_found_in_added.vcf.gz {output.vcf_tnscope} ; +tabix -p vcf -f {output.vcf_tnscope} ; + """ + diff --git a/BALSAMIC/snakemake_rules/variant_calling/vardict_pre_and_postprocessing.rule b/BALSAMIC/snakemake_rules/variant_calling/vardict_pre_and_postprocessing.rule new file mode 100644 index 000000000..b17e6c7b2 --- /dev/null +++ b/BALSAMIC/snakemake_rules/variant_calling/vardict_pre_and_postprocessing.rule @@ -0,0 +1,116 @@ + + +rule bedtools_splitbed_by_chrom: + input: + bed = config["panel"]["capture_kit"], + chrom = config["reference"]["genome_chrom_size"], + bam = config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = tumor_sample) + output: + bed = expand(vcf_dir + "split_bed/" + "{chrom}." 
+ capture_kit, chrom=chromlist) + benchmark: + Path(benchmark_dir, 'bedtools_splitbed_by_chrom.tsv').as_posix() + singularity: + Path(singularity_image, config["bioinfo_tools"].get("bedtools") + ".sif").as_posix() + params: + tmpdir = tempfile.mkdtemp(prefix=tmp_dir), + split_bed_dir = vcf_dir + "split_bed/", + origin_bed = capture_kit, + message: + ("Splitting the panel bed per chromosome, flanking regions by 100bp and merging into single VCF using bedtools") + shell: + """ +mkdir -p {params.tmpdir}; +export TMPDIR={params.tmpdir}; + +chromlist=`cut -f 1 {input.bed} | sort -u`; +sed 's/^chr//g;/_/d' {input.chrom} | sort -k1,1 > {params.split_bed_dir}hg19.chrom.sizes; + +for c in $chromlist; do \ +awk -v C=$c '$1==C' {input.bed} \ +| bedtools slop -b 100 -i - -g {params.split_bed_dir}hg19.chrom.sizes \ +| sort -k1,1 -k2,2n \ +| bedtools merge > {params.split_bed_dir}$c.{params.origin_bed}; done; + +unset chromlist; +readlink -f {input.bam}; + """ + +rule vardict_merge: + input: + expand(vcf_dir + "vardict/split_vcf/{chrom}_vardict.vcf.gz", chrom=chromlist) + output: + vcf_vardict = vcf_dir + "vardict/merged.vardict.vcf" + params: + tmpdir = tempfile.mkdtemp(prefix=tmp_dir), + sort_vcf = get_script_path("sort_vcf.awk"), + case_name = config["analysis"]["case_id"], + benchmark: + Path(benchmark_dir,'vardict_merge_' + config["analysis"]["case_id"] + ".tsv").as_posix() + singularity: + Path(singularity_image, config["bioinfo_tools"].get("bcftools") + ".sif").as_posix() + threads: + get_threads(cluster_config,"vardict_merge") + message: + ("Merging multiple VCFs from vardict into single VCF using bcftools for {params.case_name}") + shell: + """ +mkdir -p {params.tmpdir}; +export TMPDIR={params.tmpdir}; + +bcftools concat {input} > {output.vcf_vardict} ; + +rm -rf {params.tmpdir} ; + """ + +rule vardict_sort: + input: + vcf = vcf_dir + "vardict/merged.vardict.vcf" + output: + vcf_sorted = vcf_dir + "vardict/merged_sorted.vardict.vcf" + params: + tmpdir=tempfile.mkdtemp(prefix=tmp_dir), + sort_vcf=get_script_path("sort_vcf.awk"), + case_name=config["analysis"]["case_id"], + benchmark: + Path(benchmark_dir, 'vardict_sort_' + config["analysis"]["case_id"] + ".tsv").as_posix() + threads: + get_threads(cluster_config,"vardict_sort") + message: + ("Sorting merged vardict files with awk {params.case_name}") + shell: + """ +mkdir -p {params.tmpdir}; +export TMPDIR={params.tmpdir}; + +awk -f {params.sort_vcf} {input.vcf} > {output.vcf_sorted} + +rm -rf {params.tmpdir} ; + """ + +rule gatk_update_vcf_sequence_dictionary: + input: + ref = config["reference"]["reference_genome"], + vcf = vcf_dir + "vardict/SNV.somatic." + config["analysis"]["case_id"] + ".vardict.vcf.gz", + output: + vcf_vardict = vcf_dir + "SNV.somatic." + config["analysis"]["case_id"] + ".vardict.vcf.gz", + params: + housekeeper_id = {"id": config["analysis"]["case_id"], "tags": "research"}, + tmpdir=tempfile.mkdtemp(prefix=tmp_dir), + benchmark: + Path(benchmark_dir,"gatk_update_vcf_sequence_dictionary" + config["analysis"]["case_id"] + ".tsv").as_posix() + singularity: + Path(singularity_image,config["bioinfo_tools"].get("gatk") + ".sif").as_posix() + threads: + get_threads(cluster_config,"gatk_collectreadcounts") + message: + "Running GATK UpdateVCFSequenceDictionary on VarDict VCF." 
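+        # Note (added comment; a reading of the intent, not part of the original rule): the shell
+        # block below assumes the sequence dictionary sits next to the reference, so it swaps the
+        # ".fasta" suffix for ".dict" and passes that file to gatk UpdateVCFSequenceDictionary with
+        # --replace, rewriting the VarDict VCF header contigs to match the reference dictionary.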
+ shell: + """ +export TMPDIR={params.tmpdir}; + +ref=$(echo {input.ref} | sed 's/.fasta/.dict/g') ; + +gatk UpdateVCFSequenceDictionary -V {input.vcf} --source-dictionary $ref --replace --output {output.vcf_vardict} ; + +rm -rf {params.tmpdir} + """ \ No newline at end of file diff --git a/BALSAMIC/workflows/balsamic.smk b/BALSAMIC/workflows/balsamic.smk index b02ffd98d..c742d9435 100644 --- a/BALSAMIC/workflows/balsamic.smk +++ b/BALSAMIC/workflows/balsamic.smk @@ -17,12 +17,10 @@ from BALSAMIC.constants.analysis import ( from BALSAMIC.constants.paths import BALSAMIC_DIR from BALSAMIC.constants.rules import SNAKEMAKE_RULES from BALSAMIC.constants.variant_filters import ( - COMMON_SETTINGS, - SENTIEON_VARCALL_SETTINGS, + SNV_BCFTOOLS_SETTINGS_PANEL, + SNV_BCFTOOLS_SETTINGS_EXOME, + SNV_BCFTOOLS_SETTINGS_WGS, SVDB_FILTER_SETTINGS, - VARDICT_SETTINGS_PANEL, - VARDICT_SETTINGS_EXOME, - VARDICT_SETTINGS_COMMON, MANTA_FILTER_SETTINGS, ) from BALSAMIC.constants.workflow_params import ( @@ -125,16 +123,17 @@ else: status_to_sample_id = "TUMOR" + "\\\\t" + tumor_sample -# Varcaller filter settings -COMMON_FILTERS = VarCallerFilter.model_validate(COMMON_SETTINGS) +# Set SNV filter settings depending on if sample is panel / wes / wgs +if config_model.panel: + if config_model.panel.exome: + SNV_FILTER_SETTINGS = VarCallerFilter.model_validate(SNV_BCFTOOLS_SETTINGS_EXOME) + else: + SNV_FILTER_SETTINGS = VarCallerFilter.model_validate(SNV_BCFTOOLS_SETTINGS_PANEL) +else: + SNV_FILTER_SETTINGS = VarCallerFilter.model_validate(SNV_BCFTOOLS_SETTINGS_WGS) -# Set VarDict settings depending on if panel is exome or not -VARDICT = VarCallerFilter.model_validate(VARDICT_SETTINGS_PANEL) -if config_model.panel and config_model.panel.exome: - VARDICT = VarCallerFilter.model_validate(VARDICT_SETTINGS_EXOME) -SENTIEON_CALLER = VarCallerFilter.model_validate(SENTIEON_VARCALL_SETTINGS) SVDB_FILTERS = VarCallerFilter.model_validate(SVDB_FILTER_SETTINGS) MANTA_FILTERS = VarCallerFilter.model_validate(MANTA_FILTER_SETTINGS) @@ -367,155 +366,127 @@ else: f"{cnv_dir}CNV.somatic.{config['analysis']['case_id']}.purecn.purity.csv.pdf" ) + +# Collect all rules to be run + +rules_to_include = [] +analysis_type = config["analysis"]["analysis_type"] +sequence_type = config["analysis"]["sequencing_type"] + +for sub, value in SNAKEMAKE_RULES.items(): + if sub in ["common", analysis_type + "_" + sequence_type]: + for module_name, module_rules in value.items(): + rules_to_include.extend(module_rules) + +if config["analysis"]["analysis_workflow"] == "balsamic": + rules_to_include = [rule for rule in rules_to_include if "umi" not in rule] + +# Add rule for DRAGEN +if "dragen" in config: + rules_to_include.append("snakemake_rules/concatenation/concatenation.rule") + +# Add rule for GENS +if "gnomad_min_af5" in config["reference"]: + rules_to_include.append("snakemake_rules/variant_calling/gens_preprocessing.rule") +if "gnomad_min_af5" in config["reference"] and sequence_type == SequencingType.WGS: + rules_to_include.append("snakemake_rules/variant_calling/gatk_read_counts.rule") + +LOG.info(f"The following rules will be included in the workflow: {rules_to_include}") + +# If workflow includes UMI filtered results, add these results +wf_solutions = ["BALSAMIC", "Sentieon"] +if config["analysis"]["analysis_workflow"] == "balsamic-umi": + wf_solutions.append("Sentieon_umi") + # Extract variant callers for the workflow -germline_caller = [] -somatic_caller = [] +germline_caller_snv = [] +germline_caller_sv = [] +germline_caller_cnv = [] 
+somatic_caller_snv = [] somatic_caller_cnv = [] somatic_caller_sv = [] -for m in set(MutationType): - germline_caller_balsamic = get_variant_callers( + +# Collect list of variant callers to be run +for ws in wf_solutions: + somatic_caller_snv += get_variant_callers( config=config, analysis_type=config["analysis"]["analysis_type"], - workflow_solution="BALSAMIC", - mutation_type=m, + workflow_solution=ws, + mutation_type="SNV", sequencing_type=config["analysis"]["sequencing_type"], - mutation_class="germline", + mutation_class="somatic", ) - - germline_caller_sentieon = get_variant_callers( + somatic_caller_sv += get_variant_callers( config=config, analysis_type=config["analysis"]["analysis_type"], - workflow_solution="Sentieon", - mutation_type=m, + workflow_solution=ws, + mutation_type="SV", sequencing_type=config["analysis"]["sequencing_type"], - mutation_class="germline", - ) - - germline_caller = ( - germline_caller + germline_caller_balsamic + germline_caller_sentieon + mutation_class="somatic", ) - - somatic_caller_balsamic = get_variant_callers( + somatic_caller_cnv += get_variant_callers( config=config, analysis_type=config["analysis"]["analysis_type"], - workflow_solution="BALSAMIC", - mutation_type=m, + workflow_solution=ws, + mutation_type="CNV", sequencing_type=config["analysis"]["sequencing_type"], mutation_class="somatic", ) - - somatic_caller_sentieon = get_variant_callers( + germline_caller_snv += get_variant_callers( config=config, analysis_type=config["analysis"]["analysis_type"], - workflow_solution="Sentieon", - mutation_type=m, + workflow_solution=ws, + mutation_type="SNV", sequencing_type=config["analysis"]["sequencing_type"], - mutation_class="somatic", + mutation_class="germline", ) - - somatic_caller_sentieon_umi = get_variant_callers( + germline_caller_sv += get_variant_callers( config=config, analysis_type=config["analysis"]["analysis_type"], - workflow_solution="Sentieon_umi", - mutation_type=m, + workflow_solution=ws, + mutation_type="SV", sequencing_type=config["analysis"]["sequencing_type"], - mutation_class="somatic", - ) - somatic_caller = ( - somatic_caller - + somatic_caller_sentieon_umi - + somatic_caller_balsamic - + somatic_caller_sentieon + mutation_class="germline", ) - -somatic_caller_sv = get_variant_callers( - config=config, - analysis_type=config["analysis"]["analysis_type"], - workflow_solution="BALSAMIC", - mutation_type="SV", - sequencing_type=config["analysis"]["sequencing_type"], - mutation_class="somatic", -) - -somatic_caller_cnv = get_variant_callers( - config=config, - analysis_type=config["analysis"]["analysis_type"], - workflow_solution="BALSAMIC", - mutation_type="CNV", - sequencing_type=config["analysis"]["sequencing_type"], - mutation_class="somatic", -) -somatic_caller_sv.remove("svdb") -svdb_callers_prio = somatic_caller_sv + somatic_caller_cnv - -for var_caller in svdb_callers_prio: - if var_caller in somatic_caller: - somatic_caller.remove(var_caller) - -# Collect only snv callers for calculating tmb -somatic_caller_tmb = [] -for ws in ["BALSAMIC", "Sentieon", "Sentieon_umi"]: - somatic_caller_snv = get_variant_callers( + germline_caller_cnv += get_variant_callers( config=config, analysis_type=config["analysis"]["analysis_type"], workflow_solution=ws, - mutation_type="SNV", + mutation_type="CNV", sequencing_type=config["analysis"]["sequencing_type"], - mutation_class="somatic", + mutation_class="germline", ) - somatic_caller_tmb += somatic_caller_snv - -# Remove variant callers from list of callers -if "disable_variant_caller" 
in config: - variant_callers_to_remove = config["disable_variant_caller"].split(",") - for var_caller in variant_callers_to_remove: - if var_caller in somatic_caller: - somatic_caller.remove(var_caller) - if var_caller in germline_caller: - germline_caller.remove(var_caller) - -rules_to_include = [] -analysis_type = config["analysis"]["analysis_type"] -sequence_type = config["analysis"]["sequencing_type"] - -for sub, value in SNAKEMAKE_RULES.items(): - if sub in ["common", analysis_type + "_" + sequence_type]: - for module_name, module_rules in value.items(): - rules_to_include.extend(module_rules) - -if config["analysis"]["analysis_workflow"] == "balsamic": - rules_to_include = [rule for rule in rules_to_include if "umi" not in rule] - somatic_caller = [ - var_caller for var_caller in somatic_caller if "umi" not in var_caller - ] - somatic_caller_tmb = [ - var_caller for var_caller in somatic_caller_tmb if "umi" not in var_caller - ] - -# Add rule for DRAGEN -if "dragen" in config: - rules_to_include.append("snakemake_rules/concatenation/concatenation.rule") - -# Add rule for GENS -if "gnomad_min_af5" in config["reference"]: - rules_to_include.append("snakemake_rules/variant_calling/gens_preprocessing.rule") -if "gnomad_min_af5" in config["reference"] and sequence_type == SequencingType.WGS: - rules_to_include.append("snakemake_rules/variant_calling/gatk_read_counts.rule") - -LOG.info(f"The following rules will be included in the workflow: {rules_to_include}") LOG.info( - f"The following Germline variant callers will be included in the workflow: {germline_caller}" + f"The following Somatic SNV variant callers will be included in the workflow: {somatic_caller_snv}" ) LOG.info( - f"The following somatic variant callers will be included in the workflow: {somatic_caller}" + f"The following Somatic SV variant callers will be included in the workflow: {somatic_caller_sv}" +) +LOG.info( + f"The following Somatic CNV variant callers will be included in the workflow: {somatic_caller_cnv}" +) +LOG.info( + f"The following Germline SNV variant callers will be included in the workflow: {germline_caller_snv}" +) +LOG.info( + f"The following Germline SV variant callers will be included in the workflow: {germline_caller_sv}" +) +LOG.info( + f"The following Germline CNV variant callers will be included in the workflow: {germline_caller_cnv}" ) +somatic_caller_sv.remove("svdb") +svdb_callers_prio = somatic_caller_sv + somatic_caller_cnv -for r in rules_to_include: - - include: Path(BALSAMIC_DIR, r).as_posix() - +somatic_caller = somatic_caller_snv + ["svdb"] +final_somatic_snv_caller = somatic_caller_snv +if config["analysis"]["sequencing_type"] != "wgs": + remove_caller_list = ["tnscope", "vardict"] + for remove_caller in remove_caller_list: + if remove_caller in somatic_caller: + somatic_caller.remove(remove_caller) + final_somatic_snv_caller.remove(remove_caller) # Define common and analysis specific outputs quality_control_results = [ @@ -528,6 +499,7 @@ quality_control_results = [ analysis_specific_results = [] # Germline SNVs/SVs +germline_caller = germline_caller_cnv + germline_caller_sv + germline_caller_snv analysis_specific_results.extend( expand( vep_dir + "{vcf}.vcf.gz", @@ -535,10 +507,15 @@ analysis_specific_results.extend( ) ) +for r in rules_to_include: + include: Path(BALSAMIC_DIR, r).as_posix() + + # Germline SNVs specifically for genotype if config["analysis"]["analysis_type"] == "paired": analysis_specific_results.append(vep_dir + "SNV.genotype.normal.dnascope.vcf.gz") + # Raw VCFs 
analysis_specific_results.extend( expand( @@ -555,6 +532,14 @@ analysis_specific_results.extend( ) ) +# Scored clinical SNV VCFs +analysis_specific_results.extend( + expand( + vep_dir + "{vcf}.clinical.scored.vcf.gz", + vcf=get_vcf(config, final_somatic_snv_caller, [case_id]), + ) +) + # Filtered and passed post annotation clinical VCFs analysis_specific_results.extend( expand( @@ -565,10 +550,11 @@ analysis_specific_results.extend( # TMB +somatic_caller.remove("svdb") analysis_specific_results.extend( expand( vep_dir + "{vcf}.balsamic_stat", - vcf=get_vcf(config, somatic_caller_tmb, [case_id]), + vcf=get_vcf(config, somatic_caller, [case_id]), ) ) @@ -584,20 +570,11 @@ if config["analysis"]["sequencing_type"] != "wgs": ) analysis_specific_results.append(cnv_dir + case_id + ".gene_metrics") # vcf2cytosure - analysis_specific_results.extend( - expand( - vcf_dir + "CNV.somatic.{case_name}.{var_caller}.vcf2cytosure.cgh", - case_name=case_id, - var_caller=["cnvkit"], - ) - ) - # VarDict - analysis_specific_results.extend( - expand( - vep_dir + "{vcf}.research.filtered.pass.ranked.vcf.gz", - vcf=get_vcf(config, ["vardict"], [case_id]), - ) - ) + analysis_specific_results.extend(expand( + vcf_dir + "CNV.somatic.{case_name}.{var_caller}.vcf2cytosure.cgh", + case_name=case_id, + var_caller=["cnvkit"] + )) # UMI if config["analysis"]["analysis_workflow"] == "balsamic-umi": analysis_specific_results.extend(expand(umi_qc_dir + "tumor.{sample}.umi.mean_family_depth", sample=tumor_sample)) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index db224a89f..082c87a51 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -12,7 +12,12 @@ Added: * Padding of bed-regions for CNVkit to minimum 100 bases https://github.com/Clinical-Genomics/BALSAMIC/pull/1469 * Added min mapq 20 to CNVkit PON workflow https://github.com/Clinical-Genomics/BALSAMIC/pull/1465 * CNVkit PONs for Exome comprehensive 10.2, GMSsolid 15.2, GMCKsolid 4.2 https://github.com/Clinical-Genomics/BALSAMIC/pull/1465 -* Sentieon install directory path to case config arguments https://giithub.com/Clinical-Genomics/BALSAMIC/pull/1461 +* Merged VarDict with TNscope in all TGA workflows https://github.com/Clinical-Genomics/BALSAMIC/pull/1475 +* New filter for VarDict for tumor in normal contamination https://github.com/Clinical-Genomics/BALSAMIC/pull/1475 +* Export TMP environment variables to rules that lack them https://github.com/Clinical-Genomics/BALSAMIC/pull/1475 +* Added genmod ranked VCFs to be delivered https://github.com/Clinical-Genomics/BALSAMIC/pull/1475 +* Added family-id to genmod in order to get ranked variants to Scout https://github.com/Clinical-Genomics/BALSAMIC/pull/1475 +* Added Raw TNscope calls and unfiltered research-annotated SNVs to delivery https://github.com/Clinical-Genomics/BALSAMIC/pull/1475 Changed: ^^^^^^^^ @@ -22,6 +27,12 @@ Changed: * Upgrade `vcf2cytosure` version to 0.9.1 and remove hardcoded versions https://github.com/Clinical-Genomics/BALSAMIC/pull/1456 * Create new PONs for GMCKSolid v4.1, GMSMyeloid v5.3, and GMSlymphoid v7.3 https://github.com/Clinical-Genomics/BALSAMIC/pull/1465 * Refactored CNVkit rules https://github.com/Clinical-Genomics/BALSAMIC/pull/1465 +* Refactored BCFtools filter rules https://github.com/Clinical-Genomics/BALSAMIC/pull/1475 +* Renamed final UMI bamfile to ensure hsmetrics is picked up by multiqc https://github.com/Clinical-Genomics/BALSAMIC/pull/1475 +* Changed ranking model VCF from research to clinical https://github.com/Clinical-Genomics/BALSAMIC/pull/1475 +* Lowered minimum AF for TGA 
from 0.007 to 0.005 https://github.com/Clinical-Genomics/BALSAMIC/pull/1475 +* Lowered maximal SOR for TNscope in TGA tumor only cases from 3 to 2.7 https://github.com/Clinical-Genomics/BALSAMIC/pull/1475 +* Fixed TNscope research VCF filters to either PASS or triallelic site https://github.com/Clinical-Genomics/BALSAMIC/pull/1475 Removed: ^^^^^^^^ @@ -29,6 +40,9 @@ Removed: * `gatk_contest` rule https://github.com/Clinical-Genomics/BALSAMIC/pull/1432 * SGE (qsub) support https://github.com/Clinical-Genomics/BALSAMIC/pull/1372 * Fastq quality and UMI trimming command-line options https://github.com/Clinical-Genomics/BALSAMIC/pull/1358 +* ML model for TNscope https://github.com/Clinical-Genomics/BALSAMIC/pull/1475 +* All code associated with TNhaplotyper https://github.com/Clinical-Genomics/BALSAMIC/pull/1475 +* Removed research.filtered.pass files from delivery https://github.com/Clinical-Genomics/BALSAMIC/pull/1475 Fixed: ^^^^^^ diff --git a/docs/balsamic_filters.rst b/docs/balsamic_filters.rst index 676b1a671..40e5bb071 100644 --- a/docs/balsamic_filters.rst +++ b/docs/balsamic_filters.rst @@ -2,7 +2,7 @@ Calling and filtering variants *********************************** -In BALSAMIC, various bioinfo tools are integrated for reporting somatic and germline variants summarized in the table below. The choice of these tools differs between the type of analysis; `Whole Genome Sequencing (WGS)`, or `Target Genome Analysis (TGA)` and `Target Genome Analysis (TGA) with UMI-analysis activated`. +In BALSAMIC, various bioinfo tools are integrated for reporting somatic and germline variants summarized in the table below. The choice of these tools differs between the type of analysis; `Whole Genome Sequencing (WGS)`, or `Target Genome Analysis (TGA)` and `Target Genome Analysis (TGA) with UMI 3,1,1 filtering activated`. .. list-table:: SNV and small-Indel callers @@ -20,7 +20,7 @@ In BALSAMIC, various bioinfo tools are integrated for reporting somatic and germ - germline - SNV, InDel * - TNscope - - WGS, TGA (with UMIs activated) + - WGS, TGA, TGA with UMI 3,1,1 filtering applied - tumor-normal, tumor-only - somatic - SNV, InDel @@ -35,43 +35,46 @@ Various filters (Pre-call and Post-call filtering) are applied at different leve **Pre-call filtering** is where the variant-calling tool decides not to add a variant to the VCF file if the default filters of the variant-caller did not pass the filter criteria. The set of default filters differs between the various variant-calling algorithms. -To know more about the pre-call filters used by the variant callers, please have a look at the VCF header of the particular variant-calling results. +To know more about the pre-call filters and detailed arguments used by the variant callers, please have a look at the VCF header of the particular variant-calling results. For example: .. figure:: images/vcf_filters.png :width: 500px - Pre-call filters applied by the `Vardict` variant-caller is listed in the VCF header. + Pre-call filters applied by the `TNscope` variant-caller is listed in the VCF header. In the VCF file, the `FILTER` status is `PASS` if this position has passed all filters, i.e., a call is made at this position. Contrary, -if the site has not passed any of the filters, a semicolon-separated list of those failed filter(s) will be appended to the `FILTER` column instead of `PASS`. E.g., `p8;pSTD` might -indicate that at this site, the mean position in reads is less than 8, and the position in reads has a standard deviation of 0. 
+if the site has not passed any of the filters, a semicolon-separated list of those failed filter(s) will be appended to the `FILTER` column instead of `PASS`. E.g., `t_lod_fstar;alt_allele_in_normal` might +indicate that at this site there is little support that the alternative bases constitute a true somatic variant, and there is also evidence of those same bases in the normal sample. **Note:** -**In BALSAMIC, this VCF file is named as `*..vcf.gz` (eg: `SNV.somatic..vardict..vcf.gz`)** - - +**In BALSAMIC, this VCF file is often referred to as the "raw" VCF because it is the most unfiltered VCF produced, and is named `SNV.somatic..tnscope.vcf.gz`** .. figure:: images/filter_status.png :width: 500px - Vardict Variant calls with different 'FILTER' status underlined in white line (`NM4.5`, `PASS`, `p8;pSTD`) + TNscope Variant calls with different 'FILTER' status underlined in red line (`PASS`, `t_lod_fstar`, `alt_allele_in_normal`) + +**Post-call quality filtering** is where a variant is further filtered based on quality, depth, VAF, etc., with more stringent thresholds. -**Post-call filtering** is where a variant is further filtered with quality, depth, VAF, etc., with more stringent thresholds. -For `Post-call filtering`, in BALSAMIC we have applied various filtering criteria (`Vardict_filtering`_, `TNscope filtering (Tumor_normal)`_ ) depending on the analysis-type (TGS/WGS) and sample-type(tumor-only/tumor-normal). +For `Post-call filtering`, in BALSAMIC we have applied various filtering criteria depending on the analysis-type (TGS/WGS) and sample-type (tumor-only/tumor-normal), and only variants with either `PASS` or `triallelic_site` are kept. **Note:** -**In BALSAMIC, this VCF file is named as `*..filtered.vcf.gz` (eg: `SNV.somatic..vardict..filtered.vcf.gz`)** +**In BALSAMIC, this VCF file is named as `SNV.somatic..research.tnscope.vcf.gz` and is not delivered** +**Post-call variant-database filtering** is where a variant is further filtered based on its presence in certain variant-databases such as Gnomad and local variant databases built with LoqusDB. -Only those variants that fulfill the pre-call and post-call filters are scored as `PASS` in the `STATUS` column of the VCF file. We filter those `PASS` variants and deliver a final list of variants to the customer either via `Scout` or `Caesar` +This is a two-step process where variants are first filtered based on whether they exist above a specified frequency in publicly available databases, and then based on local databases of previously observed variants. + +At each step, only variants with filters `PASS` and `triallelic_site` are kept and delivered as a final list of variants to the customer either via `Scout` or `Caesar`. **Note:** -**In BALSAMIC, this VCF file is named as `*..filtered.pass.vcf.gz` (eg: `SNV.somatic..vardict..filtered.pass.vcf.gz`)** +**In BALSAMIC, the VCF file filtered only on publicly available databases is named as `*.research.filtered.pass.vcf.gz` (eg: for WGS `SNV.somatic..tnscope.research.filtered.pass.vcf.gz`)** +**In BALSAMIC, the VCF file filtered on both public and locally available databases is named as `*.clinical.filtered.pass.vcf.gz` (eg: for TGA `SNV.somatic..merged..filtered.pass.vcf.gz`)** ..
list-table:: Description of VCF files :widths: 30 50 20 @@ -81,13 +84,13 @@ Only those variants that fulfill the pre-call and post-call filters are scored a - Description - Delivered to the customer * - .vcf.gz - - Unannotated VCF file with pre-call filters included in the STATUS column + - Unannotated raw VCF file with pre-call filters included in the STATUS column + - Yes (Caesar) + * - .research.filtered.pass.vcf.gz + - Annotated VCF file with quality and population-based filters applied. - Yes (Caesar) - * - ..vcf.gz - - Annotated VCF file with pre-call filters included in the STATUS column - - No - * - ..filtered.pass.vcf.gz - - Annotated and filtered VCF file by excluding all filters that did not meet the pre and post-call filter criteria. Includes only variants with the `PASS` STATUS + * - .clinical.filtered.pass.vcf.gz + - Annotated VCF file with quality, population and local database filters applied. - Yes (Caesar and Scout) @@ -97,13 +100,14 @@ Only those variants that fulfill the pre-call and post-call filters are scored a Somatic Callers for reporting SNVs/INDELS ****************************************** + +For SNV/InDel calling in the TGA analyses of BALSAMIC, both VarDict and TNscope are used. Lists of variants are produced from both tools, which are then normalised and quality filtered before being merged. + + **Vardict** =========== `Vardict `_ is a sensitive variant caller used for both tumor-only and tumor-normal variant calling. -The results of `Vardict` variant calling are further post-filtered based on several criteria (`Vardict_filtering`_) to retrieve high-confidence variant calls. -These high-confidence variant calls are the final list of variants uploaded to Scout or available in the delivered VCF file in Caesar. - There are two slightly different post-processing filters activated depending on if the sample is an exome or a smaller panel as these tend to have very different sequencing depths. @@ -124,7 +128,7 @@ Following is the set of criteria applied for filtering vardict results. It is us :: - DP >= 100 + DP >= 50 *Variant depth (VD)*: Total reads supporting the ALT allele @@ -136,7 +140,7 @@ Following is the set of criteria applied for filtering vardict results. It is us :: - Minimum AF >= 0.007 + Minimum AF >= 0.005 **Post-call Quality Filters for exomes** @@ -163,13 +167,142 @@ Following is the set of criteria applied for filtering vardict results. It is us :: - Minimum AF >= 0.007 + Minimum AF >= 0.005 + + +**Attention:** +**BALSAMIC <= v8.2.7 uses minimum AF 1% (0.01). From Balsamic v8.2.8, minimum VAF is changed to 0.7% (0.007). From v16.0.0 minimum VAF is changed to 0.5% (0.005).** + +**For normal matched analyses** + +*Relative tumor AF in normal*: Allows for maximum Tumor-In-Normal-Contamination of 30%. + +:: + + excludes variant if: AF(normal) / AF(tumor) > 0.3 **Note:** **Additionally, the variant is excluded for tumor-normal cases if marked as 'germline' in the `STATUS` column of the VCF file.** -**Attention:** -**BALSAMIC <= v8.2.7 uses minimum AF 1% (0.01). From Balsamic v8.2.8, minimum VAF is changed to 0.7% (0.007)** + +**Sentieon's TNscope** ======================= + +The `TNscope `_ algorithm performs the somatic variant calling on the tumor-normal or the tumor-only samples. + +**TNscope filtering** +^^^^^^^^^^^^^^^^^^^^^^ + +**Pre-call Filters** + +*min_init_tumor_lod*: Initial Log odds that the variant exists in the tumor.
+ +:: + + min_init_tumor_lod = 0.5 + + +*min_tumor_lod*: Minimum log odds in the final call of variant in the tumor. + +:: + + min_tumor_lod = 4 + + +*min_init_normal_lod*: Initial Log odds that the variant exists in the normal. + +:: + + min_init_normal_lod = 0.5 + + +*min_normal_lod*: Minimum log odds in the final call of variant in the normal. + +:: + + min_normal_lod = 2.2 + +*min_dbnp_normal_lod*: Minimum normalLOD at dbSNP site. + +:: + + min_dbnp_normal_lod = 5.5 + +*max_error_per_read*: Maximum number of differences to reference per read. + +:: + + max_error_per_read = 5 + +*min_base_qual*: Minimal base quality to consider in calling. + +:: + + min_base_qual = 15 + +*min_tumor_allele_frac*: Set the minimum tumor AF to be considered as potential variant site. + +:: + + min_tumor_allele_frac = 0.0005 + +*interval_padding*: Adding an extra 100bp to each end of the target region in the bed file before variant calling. + +:: + + interval_padding = 100 + +**Post-call Filters** + +*Total Depth (DP)*: Refers to the overall read depth supporting the called variant. + +:: + + DP >= 50 + +*Variant depth (VD)*: Total reads supporting the ALT allele + +:: + + VD >= 5 + +*Allelic Frequency (AF)*: Fraction of the reads supporting the alternate allele + +:: + + Minimum AF >= 0.005 + + +**For tumor only analyses** + +*Average base quality score* + +:: + + SUM(QSS)/SUM(AD) >= 20 + +*SOR*: Symmetric Odds Ratio of 2x2 contingency table to detect strand bias + +:: + + SOR < 2.7 + +**Note:** +**Additionally, variants labeled with the triallelic site filter are not filtered out** + +**For normal matched analyses** + +*alt_allele_in_normal*: Default filter set by TNscope was considered too stringent in filtering tumor in normal and is removed. + +:: + + bcftools annotate -x FILTER/alt_allele_in_normal + + +*Relative tumor AF in normal*: Allows for maximum Tumor-In-Normal-Contamination of 30%. + +:: + + excludes variant if: AF(normal) / AF(tumor) > 0.3 **Post-call Observation database Filters** @@ -211,6 +344,8 @@ The following filter applies for both tumor-normal and tumor-only samples. minreads = 3,1,1 It means that at least `3` read-pairs need to support the UMI-group (based on the UMI-tag and the aligned genomic positions), and with at least 1 read-pair from each strand (F1R2 and F2R1). +**NOTE:** This filtering is performed on the bamfile before variant calling. + *min_init_tumor_lod*: Initial Log odds for the that the variant exists in the tumor. @@ -218,11 +353,43 @@ It means that at least `3` read-pairs need to support the UMI-group (based on th min_init_tumor_lod = 0.5 + *min_tumor_lod*: Minimum log odds in the final call of variant in the tumor. :: - min_tumor_lod = 4.0 + min_tumor_lod = 4 + + +*min_init_normal_lod*: Initial Log odds that the variant exists in the normal. + +:: + + min_init_normal_lod = 0.5 + + +*min_normal_lod*: Minimum log odds in the final call of variant in the normal. + +:: + + min_normal_lod = 2.2 + +*min_dbnp_normal_lod*: Minimum normalLOD at dbSNP site. + +:: + + min_dbnp_normal_lod = 5.5 + +*max_error_per_read*: Maximum number of differences to reference per read. + +:: + + max_error_per_read = 5 + +*min_base_qual*: Minimal base quality to consider in calling. + +:: + + min_base_qual = 15 *min_tumor_allele_frac*: Set the minimum tumor AF to be considered as potential variant site.
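To make the thresholds above concrete, here is a minimal, self-contained sketch of how a single call could be checked against the documented post-call criteria. This is plain Python for illustration only, not BALSAMIC's implementation; the workflow applies these cut-offs through its bcftools-based filter rules, and the retention of records flagged `PASS` or `triallelic_site` is handled separately. The function name and arguments below are hypothetical.

::

    from typing import Optional

    def passes_post_call_filters(
        dp: int,                             # total read depth (DP) at the site
        vd: int,                             # reads supporting the ALT allele (VD)
        af_tumor: float,                     # tumor allele frequency (AF)
        af_normal: Optional[float] = None,   # normal AF, when a matched normal exists
    ) -> bool:
        """Return True if a call clears the documented post-call thresholds."""
        if dp < 50 or vd < 5:
            return False
        if not (0.005 <= af_tumor < 1.0):
            return False
        # Tumor-in-normal contamination rule for matched analyses:
        # exclude the variant if AF(normal) / AF(tumor) > 0.3.
        if af_normal is not None and af_normal / af_tumor > 0.3:
            return False
        return True

    # Example: 1% tumor AF with 0.5% normal AF fails, since 0.005 / 0.01 = 0.5 > 0.3.
    print(passes_post_call_filters(dp=120, vd=8, af_tumor=0.01, af_normal=0.005))  # False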
@@ -236,6 +403,7 @@ It means that at least `3` read-pairs need to support the UMI-group (based on th interval_padding = 100 + **Post-call Quality Filters** *alt_allele_in_normal*: Default filter set by TNscope was considered too stringent in filtering tumor in normal and is removed. @@ -244,7 +412,6 @@ It means that at least `3` read-pairs need to support the UMI-group (based on th bcftools annotate -x FILTER/alt_allele_in_normal - *Relative tumor AF in normal*: Allows for maximum Tumor-In-Normal-Contamination of 30%. :: @@ -457,5 +624,5 @@ The variants scored as `PASS` or `triallelic_sites` are included in the final vc **BALSAMIC <= v8.2.10 uses GNOMAD_popmax <= 0.005. From Balsamic v9.0.0, this settings is changed to 0.02, to reduce the stringency.** **BALSAMIC >= v11.0.0 removes unmapped reads from the bam and cram files for all the workflows.** **BALSAMIC >= v13.0.0 keeps unmapped reads in bam and cram files for all the workflows.** - +**BALSAMIC >= v16.0.0 uses UMIs for duplicate removal bam in standard TGA workflows.** diff --git a/docs/balsamic_methods.rst b/docs/balsamic_methods.rst index 4a2f2bd87..d4b206208 100644 --- a/docs/balsamic_methods.rst +++ b/docs/balsamic_methods.rst @@ -7,13 +7,13 @@ Target Genome Analysis BALSAMIC :superscript:`1` (**version** = 15.0.1) was used to analyze the data from raw FASTQ files. We first quality controlled FASTQ files using FastQC v0.11.9 :superscript:`2`. -Adapter sequences are trimmed using fastp v0.23.2 :superscript:`3` and then UMI sequences are extracted using the UMI extract tool from sentieon-tools 202308.03 :superscript:`15` and finally low-quality bases were trimmed using fastp v0.23.2 :superscript:`3`. -Trimmed reads were mapped to the reference genome hg19 using sentieon-tools 202308.03 :superscript:`15`. -The resulting BAM is quality controlled using samtools v1.15.1 :superscript:`26`, and AlignmentStat, InsertSizeMetricAlgo, GCBias, MeanQualityByCycle, QualDistribution and BaseDistributionByCycle from sentieon-tools 202308.03 :superscript:`15`. -Duplicate reads are collapsed using the UMI function in Dedup from sentieon-tools 202308.03 :superscript:`15`, and invalid mates are corrected with tools collate and fixmate from samtools v1.15.1 :superscript:`26`. +Adapter sequences are trimmed using fastp v0.23.2 :superscript:`3` and then UMI sequences are extracted using the UMI extract tool from sentieon-tools (version 202308.03) :superscript:`15` and finally low-quality bases were trimmed using fastp v0.23.2 :superscript:`3`. +Trimmed reads were mapped to the reference genome hg19 using sentieon-tools :superscript:`15`. +The resulting BAM is quality controlled using samtools v1.15.1 :superscript:`26`, and AlignmentStat, InsertSizeMetricAlgo, GCBias, MeanQualityByCycle, QualDistribution and BaseDistributionByCycle from sentieon-tools :superscript:`15`. +Duplicate reads are collapsed using the UMI function in Dedup from sentieon-tools :superscript:`15`, and invalid mates are corrected with tools collate and fixmate from samtools v1.15.1 :superscript:`26`. Coverage metrics are collected from the final BAM using Sambamba v0.8.2 :superscript:`27`, Mosdepth v0.3.3 :superscript:`28` and CollectHsMetrics from Picard tools v2.27.1 :superscript:`6`. Results of the quality controlled steps were summarized by MultiQC v1.22.3 :superscript:`7`. 
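The bcftools post-processing referenced in these documentation updates (dropping the `alt_allele_in_normal` FILTER tag, then normalising and concatenating the VarDict and TNscope call sets) could be chained roughly as follows. This is a minimal sketch with placeholder file names; the actual Snakemake rules, their exact bcftools options, and their ordering are not shown in this changeset.

::

    import subprocess
    from typing import List

    def run(cmd: List[str]) -> None:
        """Run one command and raise if it exits non-zero."""
        subprocess.run(cmd, check=True)

    reference = "reference.fasta"      # placeholder reference FASTA
    tnscope_vcf = "tnscope.vcf.gz"     # placeholder TNscope call set
    vardict_vcf = "vardict.vcf.gz"     # placeholder VarDict call set

    # Drop the overly stringent alt_allele_in_normal FILTER tag (tumor-normal only).
    run(["bcftools", "annotate", "-x", "FILTER/alt_allele_in_normal",
         "-O", "z", "-o", "tnscope.noaain.vcf.gz", tnscope_vcf])

    # Normalise both call sets against the same reference, then index them.
    for label, vcf in [("tnscope", "tnscope.noaain.vcf.gz"), ("vardict", vardict_vcf)]:
        run(["bcftools", "norm", "-f", reference, "-O", "z",
             "-o", f"{label}.norm.vcf.gz", vcf])
        run(["bcftools", "index", "-t", f"{label}.norm.vcf.gz"])

    # Concatenate the normalised call sets into a single merged VCF.
    run(["bcftools", "concat", "-a", "-O", "z", "-o", "merged.vcf.gz",
         "tnscope.norm.vcf.gz", "vardict.norm.vcf.gz"])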
-Small somatic mutations (SNVs and INDELs) were called for each sample using VarDict 2019.06.04 :superscript:`8` and filtered using the criteria (*MQ >= 30, DP >= 100 (20 for exome-samples), VD >= 5, Minimum AF >= 0.007, Maximum AF < 1, GNOMADAF_popmax <= 0.005, swegen AF < 0.01*). +Small somatic mutations (SNVs and INDELs) were called for each sample using VarDict 2019.06.04 :superscript:`8` and Sentieon TNscope :superscript:`16` and merged using bcftools norm and concat, and filtered using the criteria (*MQ >= 30, DP >= 50 (20 for exome-samples), VD >= 5, Minimum AF >= 0.005, Maximum AF < 1, GNOMADAF_popmax <= 0.005, swegen AF < 0.01*). Only those variants that fulfilled the filtering criteria and scored as `PASS` in the VCF file were reported. Structural variants (SV) were called using Manta v1.6.0 :superscript:`9` on a post-processed version of the BAM which was base-quality capped to 70, and Delly v1.0.3 :superscript:`10`. Copy number variations (CNV) were called using CNVkit v0.9.10 :superscript:`11`. diff --git a/docs/balsamic_pon.rst b/docs/balsamic_pon.rst index 4b11eaa8e..201d7714e 100644 --- a/docs/balsamic_pon.rst +++ b/docs/balsamic_pon.rst @@ -7,6 +7,10 @@ Currently two PON-methods are implemented in BALSAMIC to correct for biases and - To produce normalised CN-profiles for WGS cases visualised in ``GENS``. +Sharing PON for publications +============================ + +If a PON has been used for the analysis of samples in a research project and a publication requires that the PON is uploaded to some database, a request can be made to Clinical Genomics. Whether the PON can be shared depends on the consent status of the individuals whose samples were used to construct it. CNVkit PON ====================== diff --git a/docs/balsamic_sv_cnv.rst b/docs/balsamic_sv_cnv.rst index f3598792b..c2b17b893 100644 --- a/docs/balsamic_sv_cnv.rst +++ b/docs/balsamic_sv_cnv.rst @@ -50,13 +50,31 @@ Depending on the sequencing type, BALSAMIC is currently running the following st - somatic - SV + +**Note:** igh_dux4 is not a variant caller itself. This is a custom script that uses samtools to detect read pairs supporting IGH::DUX4 rearrangements. + +It is mandatory to provide the gender of the sample from BALSAMIC version >= 10.0.0 for CNV analysis. + Further details about a specific caller can be found in the links for the repositories containing the documentation for SV and CNV callers along with the links for the articles are listed in `bioinfo softwares `_. -Note that igh_dux4 is not a variant caller itself. This is a custom script that uses samtools to detect read pairs supporting IGH::DUX4 rearrangements. In short, the command identifies discordant reads mapping to the IGH region and to either DUX4 or its homologous DUX4-like regions (see references for details). The inclusion of this feature aims to alleviate the failure of callers to detect this rearrangement. It is important to note, however, that the reported breakpoints are fixed to the IGH and DUX4 coordinates and are, therefore, imprecise and uncertain. Therefore, we advise caution when interpreting this information. +**Difficult-to-detect clinically relevant SVs** +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +**IGH::DUX4 rearrangements** -It is mandatory to provide the gender of the sample from BALSAMIC version >= 10.0.0 For CNV analysis. +These rearrangements are detected by a custom script that uses samtools to find supporting read pairs.
In short, the command identifies discordant reads mapping to the IGH region and to either DUX4 or its homologous DUX4-like regions (see references for details). The inclusion of this feature aims to alleviate the failure of callers to detect this rearrangement. + +It is important to note, however, that the reported breakpoints are fixed to the IGH and DUX4 coordinates and are, therefore, imprecise and uncertain. Therefore, we advise caution when interpreting this information. + +The script used to detect these rearrangements can be found in: BALSAMIC/assets/scripts/igh_dux4_detection.sh (see `references `_ Detection of IGH::DUX4 rearrangement, for more information.) + +**FLT3-ITDs** + +FLT3 Internal Tandem Duplications are quite difficult to detect and can exist in sizes detectable by both small SNV and InDel callers such as VarDict and TNscope as well as Structural Variant callers like Manta, Delly and TIDDIT. In our experience, however, the variant is more commonly detected by SV callers and we strongly recommend that this variant is looked for in the SV results. + +A decline in the ability of SNV callers to detect this variant has also been observed in release 16.0.0 of Balsamic, possibly as a result of collapsing overlapping mates of read-pairs into singletons during UMI post-processing. This adds further reason to search for this variant in the SV results! +In the future we will aim to add further callers better able to detect this variant. **Pre-merge Filtrations** diff --git a/docs/bioinfo_softwares.rst b/docs/bioinfo_softwares.rst index c49b34844..cab9f1ed1 100644 --- a/docs/bioinfo_softwares.rst +++ b/docs/bioinfo_softwares.rst @@ -122,7 +122,7 @@ sentieon-tools ~~~~~~~~~~~~~~ :Source code: `Commercial Tool` ``_ :Article: `Bioinformatics` ``_ -:Version: `202010.02` +:Version: `202308.03` somalier ~~~~~~~~ diff --git a/docs/images/.DS_Store b/docs/images/.DS_Store new file mode 100644 index 000000000..e28887000 Binary files /dev/null and b/docs/images/.DS_Store differ diff --git a/docs/images/filter_status.png b/docs/images/filter_status.png index 345275735..2f22d9c61 100644 Binary files a/docs/images/filter_status.png and b/docs/images/filter_status.png differ diff --git a/docs/images/vcf_filters.png b/docs/images/vcf_filters.png index 5937a8998..dda315acc 100644 Binary files a/docs/images/vcf_filters.png and b/docs/images/vcf_filters.png differ diff --git a/herey b/herey deleted file mode 100644 index 9004fd246..000000000 --- a/herey +++ /dev/null @@ -1,15 +0,0 @@ -o_1 834184 834185 -0.015159 -o_1 1718821 1718822 -0.120548 -o_1 1720598 1720599 -0.213593 -a_1 834184 834185 -0.015159 -a_1 1718821 1718822 -0.120548 -a_1 1720598 1720599 -0.213593 -b_1 834184 834185 -0.015159 -b_1 1718821 1718822 -0.120548 -b_1 1720598 1720599 -0.213593 -c_1 834184 834185 -0.015159 -c_1 1718821 1718822 -0.120548 -c_1 1720598 1720599 -0.213593 -d_1 834184 834185 -0.015159 -d_1 1718821 1718822 -0.120548 -d_1 1720598 1720599 -0.213593 diff --git a/tests/conftest.py b/tests/conftest.py index 0b31397cc..fdbcca19b 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -932,7 +932,7 @@ def fastq_dir_tumor_only_dummy_vep( vep_dir: Path = Path(analysis_dir, case_id_tumor_only_dummy_vep, "analysis", "vep") vep_dir.mkdir(parents=True, exist_ok=True) vep_test_file = ( - "SNV.somatic.sample_tumor_only.vardict.research.filtered.pass.vcf.gz" + "SNV.somatic.sample_tumor_only.tnscope.research.filtered.pass.vcf.gz" ) Path(vep_dir, vep_test_file).touch() @@ -2006,10 +2006,10 @@ def 
qc_extracted_metrics(metrics_yaml_path: str) -> dict: @pytest.fixture(scope="function") -def snakemake_bcftools_filter_vardict_research_tumor_only( +def snakemake_bcftools_filter_tnscope_research_tumor_only( tumor_only_config_dummy_vep, helpers ): - """bcftools_filter_vardict_research_tumor_only snakemake mock rule""" + """bcftools_filter_tnscope_research_tumor_only snakemake mock rule""" helpers.read_config(tumor_only_config_dummy_vep) vep_path = os.path.join( @@ -2017,11 +2017,11 @@ def snakemake_bcftools_filter_vardict_research_tumor_only( helpers.case_id, "analysis", "vep", - "{var_type}.somatic.{case_name}.vardict.research.filtered.pass.vcf.gz", + "{var_type}.somatic.{case_name}.tnscope.research.filtered.pass.vcf.gz", ) return Map( { - "bcftools_filter_vardict_research_tumor_only": Map( + "bcftools_filter_tnscope_research_tumor_only": Map( { "params": Map( { @@ -2033,13 +2033,13 @@ def snakemake_bcftools_filter_vardict_research_tumor_only( ), "output": Map( { - "_names": Map({"vcf_pass_vardict": vep_path}), - "vcf_pass_vardict": vep_path, + "_names": Map({"vcf_pass_tnscope": vep_path}), + "vcf_pass_tnscope": vep_path, } ), "rule": Map( { - "name": "bcftools_filter_vardict_research_tumor_only", + "name": "bcftools_filter_tnscope_research_tumor_only", "output": [ vep_path, ], @@ -2420,7 +2420,7 @@ def fixture_snakemake_executable_data( "case_id": case_id_tumor_only, "cluster_config_path": reference_file, "config_path": reference_file, - "disable_variant_caller": "tnscope,vardict", + "disable_variant_caller": "tnscope", "log_dir": session_tmp_path, "mail_user": mail_user_option, "profile": ClusterProfile.SLURM, @@ -2460,7 +2460,7 @@ def fixture_snakemake_executable_validated_data( "case_id": case_id_tumor_only, "cluster_config_path": reference_file, "config_path": reference_file, - "disable_variant_caller": "disable_variant_caller=tnscope,vardict", + "disable_variant_caller": "disable_variant_caller=tnscope", "dragen": False, "force": False, "log_dir": session_tmp_path, diff --git a/tests/models/test_params_models.py b/tests/models/test_params_models.py index 0bfbfd855..94ac522a1 100644 --- a/tests/models/test_params_models.py +++ b/tests/models/test_params_models.py @@ -11,7 +11,6 @@ from BALSAMIC.models.params import ( ParamsManta, ParamsSentieonWGSMetrics, - ParamsVardict, ParamsVEP, QCModel, UMIParamsCommon, @@ -84,31 +83,10 @@ def test_get_manta_settings_wgs(): assert manta_settings == "" -def test_params_vardict(): - """test UMIParamsVardict model for correct validation""" - - # GIVEN vardict params - test_vardict_params = { - "allelic_frequency": 0.01, - "max_pval": 0.5, - "max_mm": 2, - "column_info": "-a 1 -b 2 -c 3", - } - - # WHEN building the model - test_vardict_built = ParamsVardict(**test_vardict_params) - - # THEN assert values - assert isclose(test_vardict_built.allelic_frequency, 0.01) - assert isclose(test_vardict_built.max_pval, 0.5) - assert test_vardict_built.max_mm == 2 - assert test_vardict_built.column_info == "-a 1 -b 2 -c 3" - - def test_params_vep(): """test UMIParamsVEP model for correct validation""" - # GIVEN vardict params + # GIVEN params test_vep = {"vep_filters": "all defaults params"} # WHEN building the model @@ -195,12 +173,10 @@ def test_umiparams_common(): # GIVEN a UMI workflow common params test_commonparams = { "align_intbases": 100, - "filter_tumor_af": 0.01, } # WHEN building the model test_commonparams_built = UMIParamsCommon(**test_commonparams) # THEN assert values - assert isclose(test_commonparams_built.filter_tumor_af, 0.01) assert 
test_commonparams_built.align_intbases == 100 @@ -243,10 +219,12 @@ def test_umiparams_tnscope(): "algo": "algoname", "init_tumorLOD": 0.5, "min_tumorLOD": 6, + "filter_tumor_af": 0.01, "error_rate": 5, "prunefactor": 3, "padding": 30, "disable_detect": "abc", + "pcr_model": "NONE", } # WHEN building the model @@ -256,7 +234,9 @@ def test_umiparams_tnscope(): assert test_tnscope_params_built.algo == "algoname" assert isclose(test_tnscope_params_built.init_tumorLOD, 0.5) assert test_tnscope_params_built.min_tumorLOD == 6 + assert isclose(test_tnscope_params_built.filter_tumor_af, 0.01, rel_tol=1e-9) assert test_tnscope_params_built.error_rate == 5 assert test_tnscope_params_built.prunefactor == 3 assert test_tnscope_params_built.disable_detect == "abc" + assert test_tnscope_params_built.pcr_model == "NONE" assert test_tnscope_params_built.padding == 30 diff --git a/tests/models/test_snakemake_models.py b/tests/models/test_snakemake_models.py index d47b06f93..373763bb2 100644 --- a/tests/models/test_snakemake_models.py +++ b/tests/models/test_snakemake_models.py @@ -79,7 +79,7 @@ def test_get_config_options(snakemake_executable: SnakemakeExecutable): snakemake_config_options: str = snakemake_executable.get_config_options() # THEN the expected format should be returned - assert snakemake_config_options == "--config disable_variant_caller=tnscope,vardict" + assert snakemake_config_options == "--config disable_variant_caller=tnscope" def test_get_dragen_flag(snakemake_executable: SnakemakeExecutable): @@ -267,5 +267,5 @@ def test_get_snakemake_command( f"--cluster-config {reference_file.as_posix()} --cluster '{sys.executable} {IMMEDIATE_SUBMIT_PATH.as_posix()} " f"--account {ClusterAccount.DEVELOPMENT} --log-dir {session_tmp_path} --mail-user {mail_user_option} " f"--profile {ClusterProfile.SLURM} --qos {QOS.HIGH} --script-dir {session_tmp_path} {case_id_tumor_only} " - "{dependencies}' --config disable_variant_caller=tnscope,vardict --cores 36" + "{dependencies}' --config disable_variant_caller=tnscope --cores 36" ) diff --git a/tests/test_data/config.json b/tests/test_data/config.json index eee596d93..dc888eb5c 100644 --- a/tests/test_data/config.json +++ b/tests/test_data/config.json @@ -5,6 +5,7 @@ "n_base_limit": "50" }, "vcf": { + "vardict": { "mutation": "somatic", "mutation_type": "SNV", @@ -12,6 +13,13 @@ "sequencing_type": ["targeted"], "workflow_solution": ["BALSAMIC"] }, + "merged": { + "mutation": "somatic", + "mutation_type": "SNV", + "analysis_type": ["paired", "single"], + "sequencing_type": ["targeted"], + "workflow_solution": ["BALSAMIC"] + }, "tnscope": { "mutation": "somatic", "mutation_type": "SNV", diff --git a/tests/utils/test_utils.py b/tests/utils/test_utils.py index 3dfde35b0..72d252c52 100644 --- a/tests/utils/test_utils.py +++ b/tests/utils/test_utils.py @@ -354,7 +354,7 @@ def test_get_snakefile(): def test_get_vcf(sample_config: Dict): # GIVEN a sample_config dict and a variant callers list - variant_callers = ["tnscope", "vardict", "manta"] + variant_callers = ["tnscope", "manta"] # WHEN passing args to that function vcf_list = get_vcf( @@ -363,13 +363,12 @@ def test_get_vcf(sample_config: Dict): # THEN It should return the list of vcf file names assert any("tnscope" in vcf_name for vcf_name in vcf_list) - assert any("vardict" in vcf_name for vcf_name in vcf_list) assert any("manta" in vcf_name for vcf_name in vcf_list) def test_get_vcf_invalid_variant_caller(sample_config): # GIVEN a sample_config dict and an incorrect variant callers list - variant_callers = 
["vardict", "manta", "tnhaplotyper"] + variant_callers = ["manta", "tnhaplotyper"] # WHEN passing args to that function with pytest.raises(KeyError): @@ -476,6 +475,7 @@ def test_write_list_of_strings(tmp_path): read_written_file: list[Dict] = read_csv( csv_path=output_file.as_posix(), delimeter="\t" ) + assert read_written_file == [ {"header1": "row1_col1", "header2": "row1_col2", "header3": "row1_col3"} ] @@ -780,12 +780,12 @@ def test_get_sample_type_from_sample_name(config_dict: Dict): assert sample_type == SampleType.TUMOR -def test_get_rule_output(snakemake_bcftools_filter_vardict_research_tumor_only): +def test_get_rule_output(snakemake_bcftools_filter_tnscope_research_tumor_only): """Tests retrieval of existing output files from a specific workflow.""" # GIVEN a snakemake fastqc rule object, a rule name and a list of associated wildcards - rules = snakemake_bcftools_filter_vardict_research_tumor_only - rule_name = "bcftools_filter_vardict_research_tumor_only" + rules = snakemake_bcftools_filter_tnscope_research_tumor_only + rule_name = "bcftools_filter_tnscope_research_tumor_only" output_file_wildcards = { "sample": ["ACC1", "tumor", "normal"], "case_name": "sample_tumor_only", @@ -794,18 +794,18 @@ def test_get_rule_output(snakemake_bcftools_filter_vardict_research_tumor_only): # THEN retrieve the output files output_files = get_rule_output(rules, rule_name, output_file_wildcards) - # THEN check that the fastq files has been picked up by the function and that the tags has been correctly created + # THEN check that the output files and tags are correctly retrieved assert len(output_files) == 1 for file in output_files: # Expected file names assert ( Path(file[0]).name - == "SNV.somatic.sample_tumor_only.vardict.research.filtered.pass.vcf.gz" + == "SNV.somatic.sample_tumor_only.tnscope.research.filtered.pass.vcf.gz" ) # Expected tags assert ( file[3] - == "SNV,sample-tumor-only,vcf-pass-vardict,research-vcf-pass-vardict" + == "SNV,sample-tumor-only,vcf-pass-tnscope,research-vcf-pass-tnscope" )