Merge pull request #44 from msk-access/mutect_filter_fix

Mutect2 filtering command
msk-access · Sep 6, 2024 · 009ff78 · 009ff78
2 parents cfbbcdf + 141c9a3
commit 009ff78
Show file tree

Hide file tree

Showing 4 changed files with 296 additions and 4 deletions.
diff --git a/postprocessing_variant_calls/main.py b/postprocessing_variant_calls/main.py
@@ -2,6 +2,7 @@
 import typer
 from .vardict import vardict_process
 from .mutect import mutect_process
+from .mutect import mutect2_process
 
 # from .maf import main
 from .maf import main
@@ -26,6 +27,13 @@
     help="post-processing commands for MuTect version 1.1.5 VCFs.",
 )
 
+# muTect filter App
+app.add_typer(
+    mutect2_process.app,
+    name="mutect2",
+    help="post-processing commands for MuTect version 2 VCFs.",
+)
+
 # Add Annote Maf
 app.add_typer(
     main.app,

diff --git a/postprocessing_variant_calls/mutect/mutect2_process.py b/postprocessing_variant_calls/mutect/mutect2_process.py
@@ -0,0 +1,145 @@
+#!/usr/bin/env python
+# imports
+import os
+import sys
+import pandas as pd
+import vcf
+import time
+import logging
+from pathlib import Path
+from typing import List, Optional
+import typer
+from vcf.parser import (
+    _Info as VcfInfo,
+    _Format as VcfFormat,
+    _vcf_metadata_parser as VcfMetadataParser,
+)
+from .mutect_class import mutect_sample
+import typer
+
+
+app = typer.Typer(help="post-processing commands for MuTect version 2 VCFs.")
+
+single_app = typer.Typer()
+app.add_typer(
+    single_app,
+    name="case-control",
+    help="Post-processing commands for filtering of MuTect version 2 VCF input file.",
+)
+
+
+@single_app.command("filter")
+def filter(
+    inputVcf: Path = typer.Option(
+        ...,
+        "--inputVcf",
+        "-i",
+        exists=True,
+        file_okay=True,
+        dir_okay=False,
+        writable=False,
+        readable=True,
+        resolve_path=True,
+        help="Input vcf generated by MuTect2 which needs to be processed",
+    ),
+    inputTxt: Optional[Path] = typer.Option(
+        "/dev/null",
+        "--inputTxt",
+        "-it",
+        exists=True,
+        file_okay=True,
+        dir_okay=False,
+        writable=False,
+        readable=True,
+        resolve_path=True,
+        help="Input Txt generated by MuTect which needs to be processed. NOTE, a Txt file will not be used for Mutect2 filtering as it is not provided in standard output.",
+    ),
+    refFasta: Optional[Path] = typer.Option(
+        "/dev/null",
+        "--refFasta",
+        exists=True,
+        file_okay=True,
+        dir_okay=False,
+        writable=False,
+        readable=True,
+        resolve_path=True,
+        help="Input reference fasta",
+    ),
+    tsampleName: str = typer.Option(
+        ..., "--tsampleName", help="Name of the tumor sample."
+    ),
+    totalDepth: int = typer.Option(
+        20,
+        "--totalDepth",
+        "-dp",
+        min=0,
+        clamp=True,
+        help="Tumor total depth threshold",
+    ),
+    alleleDepth: int = typer.Option(
+        1,
+        "--alleleDepth",
+        "-ad",
+        min=0,
+        clamp=True,
+    ),
+    tnRatio: int = typer.Option(
+        1,
+        "--tnRatio",
+        "-tnr",
+        min=0,
+        clamp=True,
+        help="Tumor-Normal variant fraction ratio threshold",
+    ),
+    variantFraction: float = typer.Option(
+        0.00005,
+        "--variantFraction",
+        "-vf",
+        min=0,
+        clamp=True,
+        help="Tumor variant fraction threshold",
+    ),
+    outputDir: str = typer.Option(
+        "", "--outDir", "-o", help="Full Path to the output dir"
+    ),
+):
+    """
+    This tool helps to filter MuTect version 2 VCFs for case-control calling
+    """
+    typer.secho(
+        f"process_mutect2: Started the run for doing standard filter.",
+        fg=typer.colors.BLACK,
+    )
+    # single sample case
+    # create mutect object
+
+    to_filter = mutect_sample(
+        inputVcf,
+        inputTxt,
+        refFasta,
+        outputDir,
+        tsampleName,
+        totalDepth,
+        alleleDepth,
+        variantFraction,
+        tnRatio,
+    )
+
+    # check for normal
+    if to_filter.has_tumor_and_normal_cols():
+        vcf_out = to_filter.filter_mutect2_paired_sample()
+        typer.secho(
+            f"process_mutect2: Filtering for MuTect2 VCFs has completed. Please refer to output directory for filtered MuTect2 VCF and text file.",
+            fg=typer.colors.BLACK,
+        )
+    else:
+        typer.secho(
+            f"Tumor and normal columns not identified in input MuTect2 VCF file. Please check input file again.",
+            fg=typer.colors.RED,
+        )
+        raise typer.Abort()
+    return vcf_out
+
+
+if __name__ == "__main__":
+    app()
diff --git a/postprocessing_variant_calls/mutect/mutect_class.py b/postprocessing_variant_calls/mutect/mutect_class.py
@@ -256,6 +256,130 @@ def filter_paired_sample(self):
 
         return self.vcf_out, self.txt_out
 
+    def filter_mutect2_paired_sample(self):
+        # TODO: contiue work on method, check with Karthi / Ronak about single filter
+        """
+        @Description : The purpose of this function is to filter VCFs output from MuTect2 that tumor and normal sample info
+        @author : Rashmi Naidu
+        -input: self
+        -output:
+            - self.vcf_out
+        """
+
+        vcf_writer = vcf.Writer(open(self.vcf_out, "w"), self.vcf_reader)
+
+        def should_remove(
+            tumor_total_depth, tad, tvf, totalDepth, alleleDepth, variantFraction, nvfRF
+        ):
+            # Check the additional criteria (mark for removal if it fails)
+
+            if tvf > nvfRF:
+                if (
+                    (tumor_total_depth >= int(self.totalDepth))
+                    & (int(tad) >= int(self.alleleDepth))
+                    & (tvf >= float(self.variantFraction))
+                ):
+                    return False
+                else:
+                    return True
+            else:
+                return True
+
+        # If the caller reported the normal genotype column before the tumor, swap those around
+        if self.allsamples[1] == self.tsampleName:
+            self.vcf_reader.samples[0] = self.allsamples[1]
+            self.vcf_reader.samples[1] = self.allsamples[0]
+
+        for record in self.vcf_reader:
+            if len(record.ALT) > 1:
+                # Split the multiallele record into separate records
+                for alt_allele in record.ALT:
+                    new_record = vcf.model._Record(
+                        record.CHROM,
+                        record.POS,
+                        record.ID,
+                        record.REF,
+                        record.ALT,
+                        record.QUAL,
+                        record.FILTER,
+                        record.INFO,
+                        record.FORMAT,
+                        record.genotype,
+                        record.samples,
+                    )
+
+                    contig = new_record.CHROM
+                    position = new_record.POS
+                    ref_allele = new_record.REF
+                    alt_allele = new_record.ALT
+                    tumor_call = record.genotype(self.vcf_reader.samples[0])
+                    norm_call = record.genotype(self.vcf_reader.samples[1])
+                    t_ad_vals = tumor_call.data.AD
+                    tad = tumor_call.data.AD[1]
+                    n_ad_vals = norm_call.data.AD
+                    nad = norm_call.data.AD[1]
+                    tumor_total_depth = sum(map(int, t_ad_vals))
+                    normal_total_depth = sum(map(int, n_ad_vals))
+
+                    tdp, tvf = _tumor_variant_calculation(
+                        int(tumor_call.data.AD[0]), int(tumor_call.data.AD[1])
+                    )
+                    ndp, nvf = _normal_variant_calculation(
+                        int(norm_call.data.AD[0]), int(norm_call.data.AD[1])
+                    )
+                    nvfRF = _tvf_threshold_calculation(nvf, self.tnRatio)
+
+                    if not should_remove(
+                        tumor_total_depth,
+                        tad,
+                        tvf,
+                        self.totalDepth,
+                        self.alleleDepth,
+                        self.variantFraction,
+                        nvfRF,
+                    ):
+                        vcf_writer.write_record(record)
+                    else:
+                        continue
+
+            else:
+                contig = record.CHROM
+                position = record.POS
+                ref_allele = record.REF
+                alt_allele = record.ALT
+                tumor_call = record.genotype(self.vcf_reader.samples[0])
+                norm_call = record.genotype(self.vcf_reader.samples[1])
+                t_ad_vals = tumor_call.data.AD
+                tad = tumor_call.data.AD[1]
+                n_ad_vals = norm_call.data.AD
+                nad = norm_call.data.AD[1]
+                tumor_total_depth = sum(map(int, t_ad_vals))
+                normal_total_depth = sum(map(int, n_ad_vals))
+
+                tdp, tvf = _tumor_variant_calculation(
+                    int(tumor_call.data.AD[0]), int(tumor_call.data.AD[1])
+                )
+                ndp, nvf = _normal_variant_calculation(
+                    int(norm_call.data.AD[0]), int(norm_call.data.AD[1])
+                )
+                nvfRF = _tvf_threshold_calculation(nvf, self.tnRatio)
+
+                if not should_remove(
+                    tumor_total_depth,
+                    tad,
+                    tvf,
+                    self.totalDepth,
+                    self.alleleDepth,
+                    self.variantFraction,
+                    nvfRF,
+                ):
+                    vcf_writer.write_record(record)
+                else:
+                    continue
+        vcf_writer.close()
+
+        return self.vcf_out
+
 
 def _tumor_variant_calculation(trd, tad):
     ##############################
@@ -327,6 +451,7 @@ def _write_to_vcf(outDir, vcf_out, vcf_reader, allsamples, tsampleName, keepDict
         )
         if key_for_tracking in keepDict:
             failure_reason = keepDict.get(key_for_tracking)
+            print(failure_reason)
             # There was no failure reason for calls that had "KEEP" in their judgement column,
             # but this code uses "KEEP" as the key when they are encountered
             if failure_reason == "KEEP":
@@ -337,15 +462,21 @@ def _write_to_vcf(outDir, vcf_out, vcf_reader, allsamples, tsampleName, keepDict
 
             # If the caller reported the normal genotype column before the tumor, swap those around
 
-            if allsamples[1] == tsampleName:
-                vcf_writer.samples[0] = allsamples[1]
-                vcf_writer.samples[1] = allsamples[0]
-
             if record.FILTER == "PASS":
+                tum = record.samples[0]
+                nrm = record.samples[1]
+
+                record.samples[0] = tum
+                record.samples[1] = nrm
                 vcf_writer.write_record(record)
             # Change the failure reason to PASS, for mutations for which we want to override MuTect's assessment
             else:
                 record.FILTER = "PASS"
+                tum = record.samples[0]
+                nrm = record.samples[1]
+
+                record.samples[0] = tum
+                record.samples[1] = nrm
                 vcf_writer.write_record(record)
         else:
             continue

diff --git a/tests/test_mutect.py b/tests/test_mutect.py
@@ -7,10 +7,18 @@
 runner = CliRunner()
 
 mutect1_filter_calls = [["mutect1", "case-control", "filter", "--help"]]
+mutect2_filter_calls = [["mutect2", "case-control", "filter", "--help"]]
 
 
 # Command Test
 @pytest.mark.parametrize("call", mutect1_filter_calls)
 def test_mutect1_filter(call):
     result = runner.invoke(app, call)
     assert result.exit_code == 0
+
+
+# Command Test
+@pytest.mark.parametrize("call", mutect2_filter_calls)
+def test_mutect2_filter(call):
+    result = runner.invoke(app, call)
+    assert result.exit_code == 0