feat: merge vardict and tnscope (#1475)
#### Changed

- Refactored rules for bcftools filters
- Renamed the final UMI BAM file to ensure HsMetrics are collected in the MultiQC JSON
- Changed the ranked VCF from research to clinical
- Lowered the minimum AF for TGA from 0.007 to 0.005
- Lowered the maximum SOR for TNscope in TGA tumor-only cases from 3 to 2.7
- Changed the filter settings for the research TNscope VCF to keep only PASS or triallelic_site records (fixing issue #1293); see the sketch after this list
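
For reference, a minimal sketch of the record selection described in the last item, using vcfpy as elsewhere in this commit; the file paths are hypothetical and the actual workflow applies this selection through the refactored bcftools filter rules:

```python
import vcfpy

# Hypothetical paths; the real inputs/outputs are defined in the Snakemake rules.
reader = vcfpy.Reader.from_path("tnscope.research.vcf")
with vcfpy.Writer.from_path("tnscope.research.filtered.vcf", reader.header) as writer:
    for record in reader:
        # Keep only records whose FILTER column is PASS or triallelic_site.
        if record.FILTER and all(
            f in ("PASS", "triallelic_site") for f in record.FILTER
        ):
            writer.write_record(record)
```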

#### Added

- TNscope for TGA workflows, merged with VarDict results
- New VarDict filter for tumor-in-normal contamination
- Export of TMP environment variables to rules that lacked them
- Added genmod-ranked VCFs to the delivered files
- Added family-id to genmod in order to get ranked variants into Scout (solved this: #1045)
- Added DP and AF to the INFO field of TNscope VCFs for the ranking model
- Raw TNscope calls and unfiltered research-annotated SNVs to delivery

#### Removed

- ML model for TNscope, due to a license issue with the new version of Sentieon
- All code associated with TNhaplotyper
- Removed research.filtered.pass VCFs from delivery and storage list
mathiasbio authored Oct 14, 2024
1 parent 6890945 commit 57bdbe0
Showing 65 changed files with 2,031 additions and 1,364 deletions.
Binary file added .DS_Store
2 changes: 1 addition & 1 deletion .github/workflows/black_linter.yml
@@ -11,4 +11,4 @@ jobs:
- uses: psf/black@stable
with:
options: "--check --verbose"
version: "22.3.0"
version: "23.7.0"
109 changes: 109 additions & 0 deletions BALSAMIC/assets/scripts/modify_tnscope_infofield.py
@@ -0,0 +1,109 @@
#!/usr/bin/env python
import vcfpy
import click
import sys
import logging
from typing import List, Optional

LOG = logging.getLogger(__name__)


def summarize_ad_to_dp(ad_list):
"""
Summarizes the AD (allelic depth) field into total DP (read depth).
Parameters:
ad_list (list): List of read depths supporting each allele, [ref_depth, alt1_depth, alt2_depth, ...]
Returns:
int: Total read depth (DP) across all alleles.
"""
if ad_list is None:
return 0 # Return 0 if AD field is not present
return sum(ad_list)


@click.command()
@click.argument("input_vcf", type=click.Path(exists=True))
@click.argument("output_vcf", type=click.Path())
def process_vcf(input_vcf: str, output_vcf: str):
"""
Processes the input VCF file and writes the updated information to the output VCF file.
INPUT_VCF: Path to the input VCF file.
OUTPUT_VCF: Path to the output VCF file.
"""

# Open the input VCF file
reader: vcfpy.Reader = vcfpy.Reader.from_path(input_vcf)

# Ensure the sample name is 'TUMOR'
sample_name: str = reader.header.samples.names[0]
if sample_name != "TUMOR":
        LOG.error(
            f"The first sample is named '{sample_name}', but 'TUMOR' is expected."
)
sys.exit(1)

# Add AF and DP fields to the header if not already present
if "AF" not in reader.header.info_ids():
reader.header.add_info_line(
vcfpy.OrderedDict(
[
("ID", "AF"),
("Number", "A"),
("Type", "Float"),
("Description", "Allele Frequency"),
]
)
)

if "DP" not in reader.header.info_ids():
reader.header.add_info_line(
vcfpy.OrderedDict(
[
("ID", "DP"),
("Number", "1"),
("Type", "Integer"),
("Description", "Total Depth"),
]
)
)

# Open the output VCF file for writing
with vcfpy.Writer.from_path(output_vcf, reader.header) as writer:
# Loop through each record (variant)
for record in reader:
# Get the TUMOR sample data
sample_index: int = reader.header.samples.names.index(sample_name)
tumor_call: vcfpy.Call = record.calls[sample_index]

# Check and process AD field
tumor_ad: Optional[List[int]] = tumor_call.data.get(
"AD", None
) # AD is a list [ref_count, alt_count]
if tumor_ad is None:
LOG.warning(
f"Warning: AD field is missing for record at position {record.POS} on {record.CHROM}"
)
else:
record.INFO["DP"] = summarize_ad_to_dp(tumor_ad)

# Check and process AF field
tumor_af: Optional[float] = tumor_call.data.get("AF", None)
if tumor_af is None:
LOG.warning(
f"Warning: AF field is missing for record at position {record.POS} on {record.CHROM}"
)
record.INFO["AF"] = [0.0] # Default AF to 0.0 if missing
else:
record.INFO["AF"] = [tumor_af] # Wrap AF in a list

# Write the updated record to the output VCF file
writer.write_record(record)

click.echo(f"VCF file processed and saved to {output_vcf}")


if __name__ == "__main__":
process_vcf()
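
To make the transformation concrete, here is a minimal sketch with made-up depths of the values the script writes into the INFO field for one TUMOR call (the real numbers come from each record's FORMAT fields):

```python
# Hypothetical FORMAT values for a single TNscope record.
tumor_ad = [98, 2]        # FORMAT/AD: [ref_depth, alt_depth]
tumor_af = 0.02           # FORMAT/AF for the single ALT allele

info_dp = sum(tumor_ad)   # written to INFO/DP -> 100
info_af = [tumor_af]      # written to INFO/AF -> [0.02] (Number=A, hence a list)

assert info_dp == 100 and info_af == [0.02]
```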
16 changes: 16 additions & 0 deletions BALSAMIC/assets/scripts/sort_vcf.awk
@@ -0,0 +1,16 @@
#!/usr/bin/awk -f

BEGIN {
ENVIRON["LC_ALL"] = "en_US.UTF-8"
}

# If the line starts with a '#', it's a header, so print it as is
$1 ~ /^#/ {
print $0;
next;
}

# Otherwise, send the body lines to an external sort command
{
print $0 | "/usr/bin/sort -k1,1V -k2,2n"
}
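
How the sorting script might be invoked, as a sketch with hypothetical file names (inside the pipeline it is called from the workflow's own rules):

```python
import subprocess

# Header lines are printed first by the awk script; body lines are piped to an
# external sort and come out coordinate-sorted after it.
with open("sorted.vcf", "w") as sorted_vcf:
    subprocess.run(
        ["awk", "-f", "BALSAMIC/assets/scripts/sort_vcf.awk", "unsorted.vcf"],
        stdout=sorted_vcf,
        check=True,
    )
```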
84 changes: 50 additions & 34 deletions BALSAMIC/constants/cluster_analysis.json
@@ -59,6 +59,10 @@
"time": "06:00:00",
"n": 8
},
"picard_qc": {
"time": "06:00:00",
"n": 8
},
"bwa_mem": {
"time": "08:00:00",
"n": 6
@@ -71,6 +75,14 @@
"time": "4:00:00",
"n": 18
},
"bcftools_normalise_vcfs": {
"time": "2:00:00",
"n": 2
},
"bcftools_concatenate_vcfs": {
"time": "2:00:00",
"n": 2
},
"cnvkit_segment_CNV_research": {
"time": "6:00:00",
"n": 18
@@ -116,6 +128,10 @@
"time": "60:00:00",
"n": 86
},
"genmod_score_snvs":{
"time": "05:00:00",
"n": 8
},
"manta_germline": {
"time": "05:00:00",
"n": 16
@@ -172,13 +188,25 @@
"time": "24:00:00",
"n": 12
},
"sentieon_TNhaplotyper": {
"time": "24:00:00",
"n": 36
"vardict_merge": {
"time": "01:30:00",
"n": 5
},
"sentieon_TNhaplotyper_tumor_only": {
"time": "24:00:00",
"n": 36
"vardict_sort": {
"time": "01:30:00",
"n": 1
},
"post_process_vardict": {
"time": "01:30:00",
"n": 2
},
"vardict_tumor_normal": {
"time": "12:00:00",
"n": 18
},
"vardict_tumor_only": {
"time": "10:00:00",
"n": 9
},
"sentieon_TNscope": {
"time": "24:00:00",
@@ -212,18 +240,6 @@
"time": "06:00:00",
"n": 10
},
"vardict_merge": {
"time": "01:30:00",
"n": 5
},
"vardict_tumor_normal": {
"time": "12:00:00",
"n": 18
},
"vardict_tumor_only": {
"time": "10:00:00",
"n": 9
},
"sentieon_bwa_umiextract": {
"time": "8:00:00",
"n": 36
@@ -244,6 +260,14 @@
"time": "4:00:00",
"n": 12
},
"sentieon_tnscope_tga_t_only": {
"time": "4:00:00",
"n": 16
},
"sentieon_tnscope_tga_tumor_normal": {
"time": "5:00:00",
"n": 32
},
"bcftools_get_somaticINDEL_research": {
"time": "1:00:00",
"n": 36
@@ -356,6 +380,14 @@
"time": "01:00:00",
"n": 8
},
"bcftools_split_tnscope_variants": {
"time": "01:00:00",
"n": 4
},
"modify_tnscope_infofield": {
"time": "01:00:00",
"n": 4
},
"vcf2cytosure_convert": {
"time": "02:00:00",
"n": 8
@@ -388,14 +420,6 @@
"time": "03:00:00",
"n": 8
},
"bcftools_filter_vardict_research_tumor_only": {
"time": "04:00:00",
"n": 8
},
"bcftools_filter_vardict_clinical_tumor_only": {
"time": "03:00:00",
"n": 8
},
"bcftools_filter_tnscope_umi_research_tumor_only": {
"time": "04:00:00",
"n": 8
@@ -404,14 +428,6 @@
"time": ":03:00:00",
"n": 8
},
"bcftools_filter_vardict_research_tumor_normal": {
"time": "04:00:00",
"n": 8
},
"bcftools_filter_vardict_clinical_tumor_normal": {
"time": "03:00:00",
"n": 8
},
"bcftools_filter_TNscope_umi_research_tumor_normal": {
"time": "04:00:00",
"n": 8