Skip to content

Commit

Permalink
Merge pull request #44 from msk-access/mutect_filter_fix
Browse files Browse the repository at this point in the history
Mutect2 filtering command
  • Loading branch information
rnaidu authored Sep 6, 2024
2 parents cfbbcdf + 141c9a3 commit 009ff78
Show file tree
Hide file tree
Showing 4 changed files with 296 additions and 4 deletions.
8 changes: 8 additions & 0 deletions postprocessing_variant_calls/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import typer
from .vardict import vardict_process
from .mutect import mutect_process
from .mutect import mutect2_process

# from .maf import main
from .maf import main
Expand All @@ -26,6 +27,13 @@
help="post-processing commands for MuTect version 1.1.5 VCFs.",
)

# muTect filter App
app.add_typer(
mutect2_process.app,
name="mutect2",
help="post-processing commands for MuTect version 2 VCFs.",
)

# Add Annote Maf
app.add_typer(
main.app,
Expand Down
145 changes: 145 additions & 0 deletions postprocessing_variant_calls/mutect/mutect2_process.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
#!/usr/bin/env python
# imports
import os
import sys
import pandas as pd
import vcf
import time
import logging
from pathlib import Path
from typing import List, Optional
import typer
from vcf.parser import (
_Info as VcfInfo,
_Format as VcfFormat,
_vcf_metadata_parser as VcfMetadataParser,
)
from .mutect_class import mutect_sample
import typer


app = typer.Typer(help="post-processing commands for MuTect version 2 VCFs.")

single_app = typer.Typer()
app.add_typer(
single_app,
name="case-control",
help="Post-processing commands for filtering of MuTect version 2 VCF input file.",
)


@single_app.command("filter")
def filter(
inputVcf: Path = typer.Option(
...,
"--inputVcf",
"-i",
exists=True,
file_okay=True,
dir_okay=False,
writable=False,
readable=True,
resolve_path=True,
help="Input vcf generated by MuTect2 which needs to be processed",
),
inputTxt: Optional[Path] = typer.Option(
"/dev/null",
"--inputTxt",
"-it",
exists=True,
file_okay=True,
dir_okay=False,
writable=False,
readable=True,
resolve_path=True,
help="Input Txt generated by MuTect which needs to be processed. NOTE, a Txt file will not be used for Mutect2 filtering as it is not provided in standard output.",
),
refFasta: Optional[Path] = typer.Option(
"/dev/null",
"--refFasta",
exists=True,
file_okay=True,
dir_okay=False,
writable=False,
readable=True,
resolve_path=True,
help="Input reference fasta",
),
tsampleName: str = typer.Option(
..., "--tsampleName", help="Name of the tumor sample."
),
totalDepth: int = typer.Option(
20,
"--totalDepth",
"-dp",
min=0,
clamp=True,
help="Tumor total depth threshold",
),
alleleDepth: int = typer.Option(
1,
"--alleleDepth",
"-ad",
min=0,
clamp=True,
),
tnRatio: int = typer.Option(
1,
"--tnRatio",
"-tnr",
min=0,
clamp=True,
help="Tumor-Normal variant fraction ratio threshold",
),
variantFraction: float = typer.Option(
0.00005,
"--variantFraction",
"-vf",
min=0,
clamp=True,
help="Tumor variant fraction threshold",
),
outputDir: str = typer.Option(
"", "--outDir", "-o", help="Full Path to the output dir"
),
):
"""
This tool helps to filter MuTect version 2 VCFs for case-control calling
"""
typer.secho(
f"process_mutect2: Started the run for doing standard filter.",
fg=typer.colors.BLACK,
)
# single sample case
# create mutect object

to_filter = mutect_sample(
inputVcf,
inputTxt,
refFasta,
outputDir,
tsampleName,
totalDepth,
alleleDepth,
variantFraction,
tnRatio,
)

# check for normal
if to_filter.has_tumor_and_normal_cols():
vcf_out = to_filter.filter_mutect2_paired_sample()
typer.secho(
f"process_mutect2: Filtering for MuTect2 VCFs has completed. Please refer to output directory for filtered MuTect2 VCF and text file.",
fg=typer.colors.BLACK,
)
else:
typer.secho(
f"Tumor and normal columns not identified in input MuTect2 VCF file. Please check input file again.",
fg=typer.colors.RED,
)
raise typer.Abort()
return vcf_out


if __name__ == "__main__":
app()
139 changes: 135 additions & 4 deletions postprocessing_variant_calls/mutect/mutect_class.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,130 @@ def filter_paired_sample(self):

return self.vcf_out, self.txt_out

def filter_mutect2_paired_sample(self):
# TODO: contiue work on method, check with Karthi / Ronak about single filter
"""
@Description : The purpose of this function is to filter VCFs output from MuTect2 that tumor and normal sample info
@author : Rashmi Naidu
-input: self
-output:
- self.vcf_out
"""

vcf_writer = vcf.Writer(open(self.vcf_out, "w"), self.vcf_reader)

def should_remove(
tumor_total_depth, tad, tvf, totalDepth, alleleDepth, variantFraction, nvfRF
):
# Check the additional criteria (mark for removal if it fails)

if tvf > nvfRF:
if (
(tumor_total_depth >= int(self.totalDepth))
& (int(tad) >= int(self.alleleDepth))
& (tvf >= float(self.variantFraction))
):
return False
else:
return True
else:
return True

# If the caller reported the normal genotype column before the tumor, swap those around
if self.allsamples[1] == self.tsampleName:
self.vcf_reader.samples[0] = self.allsamples[1]
self.vcf_reader.samples[1] = self.allsamples[0]

for record in self.vcf_reader:
if len(record.ALT) > 1:
# Split the multiallele record into separate records
for alt_allele in record.ALT:
new_record = vcf.model._Record(
record.CHROM,
record.POS,
record.ID,
record.REF,
record.ALT,
record.QUAL,
record.FILTER,
record.INFO,
record.FORMAT,
record.genotype,
record.samples,
)

contig = new_record.CHROM
position = new_record.POS
ref_allele = new_record.REF
alt_allele = new_record.ALT
tumor_call = record.genotype(self.vcf_reader.samples[0])
norm_call = record.genotype(self.vcf_reader.samples[1])
t_ad_vals = tumor_call.data.AD
tad = tumor_call.data.AD[1]
n_ad_vals = norm_call.data.AD
nad = norm_call.data.AD[1]
tumor_total_depth = sum(map(int, t_ad_vals))
normal_total_depth = sum(map(int, n_ad_vals))

tdp, tvf = _tumor_variant_calculation(
int(tumor_call.data.AD[0]), int(tumor_call.data.AD[1])
)
ndp, nvf = _normal_variant_calculation(
int(norm_call.data.AD[0]), int(norm_call.data.AD[1])
)
nvfRF = _tvf_threshold_calculation(nvf, self.tnRatio)

if not should_remove(
tumor_total_depth,
tad,
tvf,
self.totalDepth,
self.alleleDepth,
self.variantFraction,
nvfRF,
):
vcf_writer.write_record(record)
else:
continue

else:
contig = record.CHROM
position = record.POS
ref_allele = record.REF
alt_allele = record.ALT
tumor_call = record.genotype(self.vcf_reader.samples[0])
norm_call = record.genotype(self.vcf_reader.samples[1])
t_ad_vals = tumor_call.data.AD
tad = tumor_call.data.AD[1]
n_ad_vals = norm_call.data.AD
nad = norm_call.data.AD[1]
tumor_total_depth = sum(map(int, t_ad_vals))
normal_total_depth = sum(map(int, n_ad_vals))

tdp, tvf = _tumor_variant_calculation(
int(tumor_call.data.AD[0]), int(tumor_call.data.AD[1])
)
ndp, nvf = _normal_variant_calculation(
int(norm_call.data.AD[0]), int(norm_call.data.AD[1])
)
nvfRF = _tvf_threshold_calculation(nvf, self.tnRatio)

if not should_remove(
tumor_total_depth,
tad,
tvf,
self.totalDepth,
self.alleleDepth,
self.variantFraction,
nvfRF,
):
vcf_writer.write_record(record)
else:
continue
vcf_writer.close()

return self.vcf_out


def _tumor_variant_calculation(trd, tad):
##############################
Expand Down Expand Up @@ -327,6 +451,7 @@ def _write_to_vcf(outDir, vcf_out, vcf_reader, allsamples, tsampleName, keepDict
)
if key_for_tracking in keepDict:
failure_reason = keepDict.get(key_for_tracking)
print(failure_reason)
# There was no failure reason for calls that had "KEEP" in their judgement column,
# but this code uses "KEEP" as the key when they are encountered
if failure_reason == "KEEP":
Expand All @@ -337,15 +462,21 @@ def _write_to_vcf(outDir, vcf_out, vcf_reader, allsamples, tsampleName, keepDict

# If the caller reported the normal genotype column before the tumor, swap those around

if allsamples[1] == tsampleName:
vcf_writer.samples[0] = allsamples[1]
vcf_writer.samples[1] = allsamples[0]

if record.FILTER == "PASS":
tum = record.samples[0]
nrm = record.samples[1]

record.samples[0] = tum
record.samples[1] = nrm
vcf_writer.write_record(record)
# Change the failure reason to PASS, for mutations for which we want to override MuTect's assessment
else:
record.FILTER = "PASS"
tum = record.samples[0]
nrm = record.samples[1]

record.samples[0] = tum
record.samples[1] = nrm
vcf_writer.write_record(record)
else:
continue
Expand Down
8 changes: 8 additions & 0 deletions tests/test_mutect.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,18 @@
runner = CliRunner()

mutect1_filter_calls = [["mutect1", "case-control", "filter", "--help"]]
mutect2_filter_calls = [["mutect2", "case-control", "filter", "--help"]]


# Command Test
@pytest.mark.parametrize("call", mutect1_filter_calls)
def test_mutect1_filter(call):
result = runner.invoke(app, call)
assert result.exit_code == 0


# Command Test
@pytest.mark.parametrize("call", mutect2_filter_calls)
def test_mutect2_filter(call):
result = runner.invoke(app, call)
assert result.exit_code == 0

0 comments on commit 009ff78

Please sign in to comment.