feat: merge vardict and tnscope (#1475)
#### Changed

- Refactored rules for bcftools filters
- Renamed the final UMI BAM file to ensure HsMetrics are collected in the MultiQC JSON
- Changed the ranked VCF from research to clinical
- Lowered the minimum AF for TGA from 0.007 to 0.005
- Lowered the maximum SOR for TNscope in TGA tumor-only cases from 3 to 2.7
- Changed the filter settings for the research TNscope VCF to keep only PASS or triallelic_site records (fixing issue #1293); see the sketch after this list
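
For reference, a minimal sketch of the record selection described in the last item, using vcfpy as elsewhere in this commit; the file paths are hypothetical and the actual workflow applies this selection through the refactored bcftools filter rules:

```python
import vcfpy

# Hypothetical paths; the real inputs/outputs are defined in the Snakemake rules.
reader = vcfpy.Reader.from_path("tnscope.research.vcf")
with vcfpy.Writer.from_path("tnscope.research.filtered.vcf", reader.header) as writer:
    for record in reader:
        # Keep only records whose FILTER column is PASS or triallelic_site.
        if record.FILTER and all(
            f in ("PASS", "triallelic_site") for f in record.FILTER
        ):
            writer.write_record(record)
```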

#### Added

- TNscope for TGA workflows, merged with VarDict results
- New VarDict filter for tumor-in-normal contamination
- Export of TMP environment variables to rules that lacked them
- Added genmod-ranked VCFs to the delivered files
- Added family-id to genmod in order to get ranked variants into Scout (solved this: #1045)
- Added DP and AF to the INFO field of TNscope VCFs for the ranking model
- Raw TNscope calls and unfiltered research-annotated SNVs to delivery

#### Removed

- ML model for TNscope, due to a license issue with the new version of Sentieon
- All code associated with TNhaplotyper
- Removed research.filtered.pass VCFs from delivery and storage list
mathiasbio authored Oct 14, 2024
1 parent 6890945 commit 57bdbe0
Showing 65 changed files with 2,031 additions and 1,364 deletions.
Binary file added .DS_Store
2 changes: 1 addition & 1 deletion .github/workflows/black_linter.yml
@@ -11,4 +11,4 @@ jobs:
- uses: psf/black@stable
with:
options: "--check --verbose"
version: "22.3.0"
version: "23.7.0"
109 changes: 109 additions & 0 deletions BALSAMIC/assets/scripts/modify_tnscope_infofield.py
@@ -0,0 +1,109 @@
#!/usr/bin/env python
import vcfpy
import click
import sys
import logging
from typing import List, Optional

LOG = logging.getLogger(__name__)


def summarize_ad_to_dp(ad_list):
"""
Summarizes the AD (allelic depth) field into total DP (read depth).
Parameters:
ad_list (list): List of read depths supporting each allele, [ref_depth, alt1_depth, alt2_depth, ...]
Returns:
int: Total read depth (DP) across all alleles.
"""
if ad_list is None:
return 0 # Return 0 if AD field is not present
return sum(ad_list)


@click.command()
@click.argument("input_vcf", type=click.Path(exists=True))
@click.argument("output_vcf", type=click.Path())
def process_vcf(input_vcf: str, output_vcf: str):
"""
Processes the input VCF file and writes the updated information to the output VCF file.
INPUT_VCF: Path to the input VCF file.
OUTPUT_VCF: Path to the output VCF file.
"""

# Open the input VCF file
reader: vcfpy.Reader = vcfpy.Reader.from_path(input_vcf)

# Ensure the sample name is 'TUMOR'
sample_name: str = reader.header.samples.names[0]
if sample_name != "TUMOR":
        LOG.error(
            f"The first sample is named '{sample_name}', but 'TUMOR' is expected."
)
sys.exit(1)

# Add AF and DP fields to the header if not already present
if "AF" not in reader.header.info_ids():
reader.header.add_info_line(
vcfpy.OrderedDict(
[
("ID", "AF"),
("Number", "A"),
("Type", "Float"),
("Description", "Allele Frequency"),
]
)
)

if "DP" not in reader.header.info_ids():
reader.header.add_info_line(
vcfpy.OrderedDict(
[
("ID", "DP"),
("Number", "1"),
("Type", "Integer"),
("Description", "Total Depth"),
]
)
)

# Open the output VCF file for writing
with vcfpy.Writer.from_path(output_vcf, reader.header) as writer:
# Loop through each record (variant)
for record in reader:
# Get the TUMOR sample data
sample_index: int = reader.header.samples.names.index(sample_name)
tumor_call: vcfpy.Call = record.calls[sample_index]

# Check and process AD field
tumor_ad: Optional[List[int]] = tumor_call.data.get(
"AD", None
) # AD is a list [ref_count, alt_count]
if tumor_ad is None:
LOG.warning(
f"Warning: AD field is missing for record at position {record.POS} on {record.CHROM}"
)
else:
record.INFO["DP"] = summarize_ad_to_dp(tumor_ad)

# Check and process AF field
tumor_af: Optional[float] = tumor_call.data.get("AF", None)
if tumor_af is None:
LOG.warning(
f"Warning: AF field is missing for record at position {record.POS} on {record.CHROM}"
)
record.INFO["AF"] = [0.0] # Default AF to 0.0 if missing
else:
record.INFO["AF"] = [tumor_af] # Wrap AF in a list

# Write the updated record to the output VCF file
writer.write_record(record)

click.echo(f"VCF file processed and saved to {output_vcf}")


if __name__ == "__main__":
process_vcf()
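
To make the transformation concrete, here is a minimal sketch with made-up depths of the values the script writes into the INFO field for one TUMOR call (the real numbers come from each record's FORMAT fields):

```python
# Hypothetical FORMAT values for a single TNscope record.
tumor_ad = [98, 2]        # FORMAT/AD: [ref_depth, alt_depth]
tumor_af = 0.02           # FORMAT/AF for the single ALT allele

info_dp = sum(tumor_ad)   # written to INFO/DP -> 100
info_af = [tumor_af]      # written to INFO/AF -> [0.02] (Number=A, hence a list)

assert info_dp == 100 and info_af == [0.02]
```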
16 changes: 16 additions & 0 deletions BALSAMIC/assets/scripts/sort_vcf.awk
@@ -0,0 +1,16 @@
#!/usr/bin/awk -f

BEGIN {
ENVIRON["LC_ALL"] = "en_US.UTF-8"
}

# If the line starts with a '#', it's a header, so print it as is
$1 ~ /^#/ {
print $0;
next;
}

# Otherwise, send the body lines to an external sort command
{
print $0 | "/usr/bin/sort -k1,1V -k2,2n"
}
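
How the sorting script might be invoked, as a sketch with hypothetical file names (inside the pipeline it is called from the workflow's own rules):

```python
import subprocess

# Header lines are printed first by the awk script; body lines are piped to an
# external sort and come out coordinate-sorted after it.
with open("sorted.vcf", "w") as sorted_vcf:
    subprocess.run(
        ["awk", "-f", "BALSAMIC/assets/scripts/sort_vcf.awk", "unsorted.vcf"],
        stdout=sorted_vcf,
        check=True,
    )
```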
84 changes: 50 additions & 34 deletions BALSAMIC/constants/cluster_analysis.json
@@ -59,6 +59,10 @@
"time": "06:00:00",
"n": 8
},
"picard_qc": {
"time": "06:00:00",
"n": 8
},
"bwa_mem": {
"time": "08:00:00",
"n": 6
@@ -71,6 +75,14 @@
"time": "4:00:00",
"n": 18
},
"bcftools_normalise_vcfs": {
"time": "2:00:00",
"n": 2
},
"bcftools_concatenate_vcfs": {
"time": "2:00:00",
"n": 2
},
"cnvkit_segment_CNV_research": {
"time": "6:00:00",
"n": 18
@@ -116,6 +128,10 @@
"time": "60:00:00",
"n": 86
},
"genmod_score_snvs":{
"time": "05:00:00",
"n": 8
},
"manta_germline": {
"time": "05:00:00",
"n": 16
@@ -172,13 +188,25 @@
"time": "24:00:00",
"n": 12
},
"sentieon_TNhaplotyper": {
"time": "24:00:00",
"n": 36
"vardict_merge": {
"time": "01:30:00",
"n": 5
},
"sentieon_TNhaplotyper_tumor_only": {
"time": "24:00:00",
"n": 36
"vardict_sort": {
"time": "01:30:00",
"n": 1
},
"post_process_vardict": {
"time": "01:30:00",
"n": 2
},
"vardict_tumor_normal": {
"time": "12:00:00",
"n": 18
},
"vardict_tumor_only": {
"time": "10:00:00",
"n": 9
},
"sentieon_TNscope": {
"time": "24:00:00",
@@ -212,18 +240,6 @@
"time": "06:00:00",
"n": 10
},
"vardict_merge": {
"time": "01:30:00",
"n": 5
},
"vardict_tumor_normal": {
"time": "12:00:00",
"n": 18
},
"vardict_tumor_only": {
"time": "10:00:00",
"n": 9
},
"sentieon_bwa_umiextract": {
"time": "8:00:00",
"n": 36
@@ -244,6 +260,14 @@
"time": "4:00:00",
"n": 12
},
"sentieon_tnscope_tga_t_only": {
"time": "4:00:00",
"n": 16
},
"sentieon_tnscope_tga_tumor_normal": {
"time": "5:00:00",
"n": 32
},
"bcftools_get_somaticINDEL_research": {
"time": "1:00:00",
"n": 36
@@ -356,6 +380,14 @@
"time": "01:00:00",
"n": 8
},
"bcftools_split_tnscope_variants": {
"time": "01:00:00",
"n": 4
},
"modify_tnscope_infofield": {
"time": "01:00:00",
"n": 4
},
"vcf2cytosure_convert": {
"time": "02:00:00",
"n": 8
@@ -388,14 +420,6 @@
"time": "03:00:00",
"n": 8
},
"bcftools_filter_vardict_research_tumor_only": {
"time": "04:00:00",
"n": 8
},
"bcftools_filter_vardict_clinical_tumor_only": {
"time": "03:00:00",
"n": 8
},
"bcftools_filter_tnscope_umi_research_tumor_only": {
"time": "04:00:00",
"n": 8
@@ -404,14 +428,6 @@
"time": ":03:00:00",
"n": 8
},
"bcftools_filter_vardict_research_tumor_normal": {
"time": "04:00:00",
"n": 8
},
"bcftools_filter_vardict_clinical_tumor_normal": {
"time": "03:00:00",
"n": 8
},
"bcftools_filter_TNscope_umi_research_tumor_normal": {
"time": "04:00:00",
"n": 8