diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java index 5f1a2dc17ab..5e2920f8204 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java @@ -3,6 +3,10 @@ import com.google.common.collect.HashBiMap; import htsjdk.variant.variantcontext.Allele; +import java.util.Arrays; +import java.util.List; +import java.util.Set; +import java.util.HashSet; import java.util.Map; import static java.util.Map.entry; @@ -161,6 +165,39 @@ public enum ComplexVariantSubtype { public static final String LOW_QS_SCORE_FILTER_KEY = "LOW_QS"; public static final String FREQUENCY_FILTER_KEY = "FREQ"; + // CleanVcf + public static final String ME = "ME"; + public static final String VAR_GQ = "varGQ"; + public static final String MULTIALLELIC = "MULTIALLELIC"; + public static final String UNRESOLVED = "UNRESOLVED"; + public static final String HIGH_SR_BACKGROUND = "HIGH_SR_BACKGROUND"; + public static final String BOTHSIDES_SUPPORT = "BOTHSIDES_SUPPORT"; + public static final String PESR_GT_OVERDISPERSION = "PESR_GT_OVERDISPERSION"; + public static final String REVISED_EVENT = "REVISED_EVENT"; + public static final String MULTI_CNV = "MULTI_CNV"; + + public static final String RD_CN = "RD_CN"; + public static final String RD_GQ = "RD_GQ"; + public static final String PE_GT = "PE_GT"; + public static final String SR_GT = "SR_GT"; + public static final String PE_GQ = "PE_GQ"; + public static final String SR_GQ = "SR_GQ"; + public static final String CNV = "CNV"; + public static final String UNR = "UNR"; + public static final String EVENT = "EVENT"; + public static final String EV = "EV"; + public static final List EV_VALUES = Arrays.asList( + null, "RD", "PE", "RD,PE", "SR", "RD,SR", "PE,SR", "RD,PE,SR" + ); + public 
static final Set FILTER_VCF_LINES = new HashSet<>(Arrays.asList( + "CIPOS", "CIEND", "RMSSTD", "source", "bcftools", "GATKCommandLine", "#CHROM" + )); + + public static final Set FILTER_VCF_INFO_LINES = new HashSet<>(Arrays.asList( + GATKSVVCFConstants.UNRESOLVED, GATKSVVCFConstants.MULTIALLELIC, GATKSVVCFConstants.VAR_GQ, + GATKSVVCFConstants.MULTI_CNV, GATKSVVCFConstants.REVISED_EVENT, GATKSVVCFConstants.EVENT + )); + // Clustering public static final String CLUSTER_MEMBER_IDS_KEY = "MEMBERS"; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java new file mode 100644 index 00000000000..07a29d95f5a --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java @@ -0,0 +1,337 @@ +package org.broadinstitute.hellbender.tools.walkers.sv; + +import htsjdk.variant.variantcontext.Allele; +import htsjdk.variant.variantcontext.Genotype; +import htsjdk.variant.variantcontext.GenotypeBuilder; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.variantcontext.VariantContextBuilder; +import htsjdk.variant.variantcontext.writer.VariantContextWriter; +import htsjdk.variant.vcf.VCFHeader; +import htsjdk.variant.vcf.VCFInfoHeaderLine; +import htsjdk.variant.vcf.VCFFilterHeaderLine; +import htsjdk.variant.vcf.VCFHeaderLineType; + +import org.broadinstitute.barclay.argparser.Argument; +import org.broadinstitute.barclay.argparser.BetaFeature; +import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; +import org.broadinstitute.barclay.help.DocumentedFeature; +import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; +import org.broadinstitute.hellbender.cmdline.programgroups.StructuralVariantDiscoveryProgramGroup; +import org.broadinstitute.hellbender.engine.*; +import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants; +import 
org.broadinstitute.hellbender.utils.tsv.TableUtils; +import org.broadinstitute.hellbender.utils.tsv.TableReader; + +import java.io.BufferedWriter; +import java.io.FileWriter; +import java.io.IOException; +import java.nio.file.Path; + +import java.util.Arrays; +import java.util.List; +import java.util.ArrayList; +import java.util.Set; +import java.util.stream.Collectors; + +/** + * Completes an initial series of cleaning steps for a VCF produced by the GATK-SV pipeline. + * + *

Inputs

+ *
    + *
  • + * VCF containing structural variant (SV) records from the GATK-SV pipeline. + *
  • + *
  • + * TODO + *
  • + *
+ * + *

Output

+ *
    + *
  • + * Cleansed VCF. + *
  • + *
+ * + *

Usage Example

+ *
+ *     gatk SVCleanPt1a -V input.vcf.gz --fail-list fail_variant_ids.txt --pass-list pass_variant_ids.txt --output-samples-list samples.txt -O cleaned.vcf.gz
+ * 
+ * + *

Processing Steps

+ *
    + *
  1. + * Reformats EV genotype fields, converts varGQ to site QUAL, reclassifies UNRESOLVED as a FILTER, flags noisy and bothsides-support events, and adjusts large allosome DEL/DUP genotypes. + *
  2. + *
+ */ +@CommandLineProgramProperties( + summary = "Clean and format SV VCF", + oneLineSummary = "Clean and format SV VCF", + programGroup = StructuralVariantDiscoveryProgramGroup.class +) +@BetaFeature +@DocumentedFeature +public final class SVCleanPt1a extends VariantWalker { + public static final String CHRX_LONG_NAME = "chr-X"; + public static final String CHRY_LONG_NAME = "chr-Y"; + public static final String FAIL_LIST_LONG_NAME = "fail-list"; + public static final String PASS_LIST_LONG_NAME = "pass-list"; + public static final String OUTPUT_SAMPLES_LIST_LONG_NAME = "output-samples-list"; + + @Argument( + fullName = CHRX_LONG_NAME, + doc = "chrX column name", + optional = true + ) + private final String chrX = "chrX"; + + @Argument( + fullName = CHRY_LONG_NAME, + doc = "chrY column name", + optional = true + ) + private final String chrY = "chrY"; + + @Argument( + fullName = FAIL_LIST_LONG_NAME, + doc = "File with variants failing the background test" + ) + private GATKPath failList; + + @Argument( + fullName = PASS_LIST_LONG_NAME, + doc = "File with variants passing both sides" + ) + private GATKPath passList; + + @Argument( + fullName = OUTPUT_SAMPLES_LIST_LONG_NAME, + doc = "Output file with samples" + ) + private GATKPath outputSamplesList; + + @Argument( + fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, + shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, + doc = "Output VCF name" + ) + private GATKPath outputVcf; + + private VariantContextWriter vcfWriter; + private BufferedWriter samplesWriter = null; + + private Set failSet; + private Set passSet; + + private static final int MIN_ALLOSOME_EVENT_SIZE = 5000; + + @Override + public void onTraversalStart() { + // Read supporting files + failSet = readLastColumn(failList); + passSet = readLastColumn(passList); + + // Add new header lines + VCFHeader header = getHeaderForVariants(); + header.addMetaDataLine(new VCFFilterHeaderLine(GATKSVVCFConstants.UNRESOLVED, "Variant is unresolved")); + 
header.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.HIGH_SR_BACKGROUND, 0, VCFHeaderLineType.Flag, "High number of SR splits in background samples indicating messy region")); + header.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.BOTHSIDES_SUPPORT, 0, VCFHeaderLineType.Flag, "Variant has read-level support for both sides of breakpoint")); + header.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.REVISED_EVENT, 0, VCFHeaderLineType.Flag, "Variant has been revised due to a copy number mismatch")); + + // Write header + vcfWriter = createVCFWriter(outputVcf); + vcfWriter.writeHeader(header); + + // Write samples list + try { + samplesWriter = new BufferedWriter(new FileWriter(outputSamplesList.toPath().toFile())); + for (String sample : header.getGenotypeSamples()) { + samplesWriter.write(sample); + samplesWriter.newLine(); + } + samplesWriter.flush(); + } catch (IOException e) { + throw new RuntimeException("Can't create output file", e); + } + } + + @Override + public void closeTool() { + if (vcfWriter != null) { + vcfWriter.close(); + } + } + + @Override + public void apply(final VariantContext variant, final ReadsContext readsContext, final ReferenceContext referenceContext, final FeatureContext featureContext) { + // Create core data + VariantContextBuilder builder = new VariantContextBuilder(variant); + List genotypes = variant.getGenotypes(); + + // Process variant + genotypes = processEV(genotypes); + processVarGQ(variant, builder); + processMultiallelic(variant, builder); + processUnresolved(variant, builder); + processNoisyEvents(variant, builder); + processBothsidesSupportEvents(variant, builder); + genotypes = processAllosomes(variant, builder, genotypes); + + builder.genotypes(genotypes); + vcfWriter.add(builder.make()); + } + + private List processEV(final List genotypes) { + List updatedGenotypes = new ArrayList<>(genotypes.size()); + for (Genotype genotype : genotypes) { + if 
(genotype.hasExtendedAttribute(GATKSVVCFConstants.EV)) { + GenotypeBuilder gb = new GenotypeBuilder(genotype); + String evAttribute = (String) genotype.getExtendedAttribute(GATKSVVCFConstants.EV); + final int evIndex = Integer.parseInt(evAttribute); + if (evIndex >= 0 && evIndex < GATKSVVCFConstants.EV_VALUES.size()) { + gb.attribute(GATKSVVCFConstants.EV, GATKSVVCFConstants.EV_VALUES.get(evIndex)); + } + updatedGenotypes.add(gb.make()); + } else { + updatedGenotypes.add(genotype); + } + } + return updatedGenotypes; + } + + private void processVarGQ(final VariantContext variant, final VariantContextBuilder builder) { + if (variant.hasAttribute(GATKSVVCFConstants.VAR_GQ)) { + final double varGQ = variant.getAttributeAsDouble(GATKSVVCFConstants.VAR_GQ, 0); + builder.rmAttribute(GATKSVVCFConstants.VAR_GQ); + builder.log10PError(varGQ / -10.0); + } + } + + private void processMultiallelic(final VariantContext variant, final VariantContextBuilder builder) { + if (variant.hasAttribute(GATKSVVCFConstants.MULTIALLELIC)) { + builder.rmAttribute(GATKSVVCFConstants.MULTIALLELIC); + } + } + + private void processUnresolved(final VariantContext variant, final VariantContextBuilder builder) { + if (variant.hasAttribute(GATKSVVCFConstants.UNRESOLVED)) { + builder.rmAttribute(GATKSVVCFConstants.UNRESOLVED); + builder.filter(GATKSVVCFConstants.UNRESOLVED); + } + } + + private void processNoisyEvents(final VariantContext variant, final VariantContextBuilder builder) { + if (failSet.contains(variant.getID())) { + builder.attribute(GATKSVVCFConstants.HIGH_SR_BACKGROUND, true); + } + } + + private void processBothsidesSupportEvents(final VariantContext variant, final VariantContextBuilder builder) { + if (passSet.contains(variant.getID())) { + builder.attribute(GATKSVVCFConstants.BOTHSIDES_SUPPORT, true); + } + } + + private List processAllosomes(final VariantContext variant, final VariantContextBuilder builder, final List genotypes) { + final String chromosome = variant.getContig(); + 
if (!chromosome.equals(chrX) && !chromosome.equals(chrY)) { + return genotypes; + } + + List updatedGenotypes = new ArrayList<>(genotypes.size()); + for (Genotype genotype : genotypes) { + final String svType = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); + if ((svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL) || svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DUP)) && + (variant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0) >= MIN_ALLOSOME_EVENT_SIZE)) { + GenotypeBuilder gb = new GenotypeBuilder(genotype); + final boolean isY = chromosome.equals(chrY); + final int sex = (int) genotype.getExtendedAttribute(GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT); + if (sex == 1 && isRevisableEvent(variant, isY, sex)) { // Male + builder.attribute(GATKSVVCFConstants.REVISED_EVENT, true); + adjustMaleGenotype(genotype, gb, svType); + } else if (sex == 2 && isY) { // Female + gb.alleles(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL)); + } else if (sex == 0) { // Unknown + gb.alleles(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL)); + } + updatedGenotypes.add(gb.make()); + } + else { + updatedGenotypes.add(genotype); + } + } + return updatedGenotypes; + } + + private void adjustMaleGenotype(final Genotype genotype, final GenotypeBuilder gb, final String svType) { + if (genotype.hasExtendedAttribute(GATKSVVCFConstants.RD_CN)) { + final int rdCN = Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN).toString()); + gb.attribute(GATKSVVCFConstants.RD_CN, rdCN + 1); + + final Allele refAllele = genotype.getAllele(0); + final Allele altAllele = genotype.getAllele(1); + if (svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL)) { + if (rdCN >= 1) gb.alleles(Arrays.asList(refAllele, refAllele)); + else if (rdCN == 0) gb.alleles(Arrays.asList(refAllele, altAllele)); + } else if (svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DUP)) { + if (rdCN <= 1) gb.alleles(Arrays.asList(refAllele, refAllele)); + else if (rdCN == 2) 
gb.alleles(Arrays.asList(refAllele, altAllele)); + else gb.alleles(Arrays.asList(altAllele, altAllele)); + } + } + } + + private boolean isRevisableEvent(final VariantContext variant, final boolean isY, final int sex) { + final List genotypes = variant.getGenotypes(); + final int[] maleCounts = new int[4]; + final int[] femaleCounts = new int[4]; + for (final Genotype genotype : genotypes) { + final int rdCN = (int) genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN, -1); + final int rdCNVal = Math.min(rdCN, 3); + if (rdCNVal == -1) continue; + + if (sex == 1) { + maleCounts[rdCNVal]++; + } else if (sex == 2) { + femaleCounts[rdCNVal]++; + } + } + + final int maleMedian = calcMedianDistribution(maleCounts); + final int femaleMedian = calcMedianDistribution(femaleCounts); + return maleMedian == 2 && (isY ? femaleMedian == 0 : femaleMedian == 4); + } + + private int calcMedianDistribution(final int[] counts) { + final int total = Arrays.stream(counts).sum(); + if (total == 0) return -1; + + final int target = total / 2; + int runningTotal = 0; + for (int i = 0; i < 4; i++) { + runningTotal += counts[i]; + if (runningTotal == target) { + return i * 2 + 1; + } else if (runningTotal > target) { + return i * 2; + } + } + throw new RuntimeException("Error calculating median"); + } + + private Set readLastColumn(final GATKPath filePath) { + try { + final Path path = filePath.toPath(); + final TableReader reader = TableUtils.reader(path, (columns, exceptionFactory) -> + (dataline) -> dataline.get(columns.columnCount() - 1) + ); + + Set result = reader.stream().collect(Collectors.toSet()); + reader.close(); + return result; + } catch (IOException e) { + throw new RuntimeException("Error reading variant list file: " + filePath, e); + } + } +} \ No newline at end of file diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1b.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1b.java new file mode 100644 index 
00000000000..46ec2a56a73 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1b.java @@ -0,0 +1,335 @@ +package org.broadinstitute.hellbender.tools.walkers.sv; + +import htsjdk.variant.variantcontext.Genotype; +import htsjdk.variant.variantcontext.GenotypeBuilder; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.variantcontext.VariantContextBuilder; +import htsjdk.variant.variantcontext.writer.VariantContextWriter; +import htsjdk.variant.vcf.VCFHeader; +import htsjdk.variant.vcf.VCFHeaderLineType; +import htsjdk.variant.vcf.VCFInfoHeaderLine; + +import org.broadinstitute.barclay.argparser.Argument; +import org.broadinstitute.barclay.argparser.BetaFeature; +import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; +import org.broadinstitute.barclay.help.DocumentedFeature; +import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; +import org.broadinstitute.hellbender.cmdline.programgroups.StructuralVariantDiscoveryProgramGroup; +import org.broadinstitute.hellbender.engine.*; +import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants; + +import java.util.Arrays; +import java.util.List; +import java.util.ArrayList; +import java.util.Set; +import java.util.Map; +import java.util.HashSet; +import java.util.HashMap; + +import org.apache.commons.lang3.tuple.ImmutablePair; +import org.apache.commons.lang3.tuple.Pair; + +/** + * Completes an initial series of cleaning steps for a VCF produced by the GATK-SV pipeline. + * + *

Inputs

+ *
    + *
  • + * VCF containing structural variant (SV) records from the GATK-SV pipeline. + *
  • + *
  • + * TODO + *
  • + *
+ * + *

Output

+ *
    + *
  • + * Cleansed VCF. + *
  • + *
+ * + *

Usage Example

+ *
+ *     gatk SVCleanPt1b -V input.vcf.gz -O cleaned.vcf.gz
+ * 
+ * + *

Processing Steps

+ *
    + *
  1. + * Finds overlapping large DEL/DUP pairs, revises genotypes of narrower calls driven by wider overlapping CNVs, and flags multiallelic CNVs with the MULTI_CNV INFO field. + *
  2. + *
+ */ +@CommandLineProgramProperties( + summary = "Clean and format SV VCF", + oneLineSummary = "Clean and format SV VCF", + programGroup = StructuralVariantDiscoveryProgramGroup.class +) +@BetaFeature +@DocumentedFeature +public class SVCleanPt1b extends MultiplePassVariantWalker { + @Argument( + fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, + shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, + doc = "Output VCF name" + ) + private GATKPath outputVcf; + + private VariantContextWriter vcfWriter; + + private final List overlappingVariantsBuffer = new ArrayList<>(); + private final Map>> revisedEventsAll = new HashMap<>(); + private final Map> revisedEventsFiltered = new HashMap<>(); + private final Map> revisedRdCn = new HashMap<>(); + + private static final int MIN_VARIANT_SIZE = 5000; + + @Override + protected int numberOfPasses() { + return 3; + } + + @Override + public void onTraversalStart() { + vcfWriter = createVCFWriter(outputVcf); + final VCFHeader header = getHeaderForVariants(); + header.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.MULTI_CNV, 0, VCFHeaderLineType.Flag, "Variant is a multiallelic CNV")); + vcfWriter.writeHeader(header); + } + + @Override + public void closeTool() { + if (vcfWriter != null) { + vcfWriter.close(); + } + } + + @Override + protected void nthPassApply(final VariantContext variant, final ReadsContext readsContext, final ReferenceContext referenceContext, final FeatureContext featureContext, final int n) { + switch (n) { + case 0: + firstPassApply(variant); + break; + case 1: + secondPassApply(variant); + break; + case 2: + thirdPassApply(variant); + break; + default: + throw new IllegalArgumentException("Invalid pass number: " + n); + } + } + + @Override + protected void afterNthPass(final int n) { + switch (n) { + case 0: + processCollectedVariants(); + break; + } + } + + public void firstPassApply(final VariantContext variant) { + if (!isDelDup(variant) || !isLarge(variant, MIN_VARIANT_SIZE)) { + 
return; + } + + // Process overlaps with variants in the buffer + overlappingVariantsBuffer.removeIf(vc -> !vc.getContig().equals(variant.getContig()) + || (vc.getStart() + vc.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)) < variant.getStart()); + for (VariantContext bufferedVariant : overlappingVariantsBuffer) { + if (overlaps(bufferedVariant, variant)) { + processOverlap(bufferedVariant, variant); + } + } + overlappingVariantsBuffer.add(variant); + } + + public void secondPassApply(final VariantContext variant) { + if (!revisedEventsFiltered.containsKey(variant.getID())) { + return; + } + + // Initialize data structures + final String variantId = variant.getID(); + final Set samples = revisedEventsFiltered.get(variantId); + final Map variantRdCn = new HashMap<>(); + + // Initialize revisedRdCn value for each variant + for (final String sampleName : samples) { + final Genotype genotype = variant.getGenotype(sampleName); + if (!genotype.hasExtendedAttribute(GATKSVVCFConstants.RD_CN)) continue; + + final String rdCn = (String) genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN); + variantRdCn.put(sampleName, Integer.parseInt(rdCn)); + } + revisedRdCn.put(variantId, variantRdCn); + } + + public void thirdPassApply(final VariantContext variant) { + VariantContextBuilder builder = new VariantContextBuilder(variant); + if (revisedEventsAll.containsKey(variant.getID())) { + processVariant(builder, variant); + } + processCnvs(builder, variant); + vcfWriter.add(builder.make()); + } + + private void processOverlap(final VariantContext v1, final VariantContext v2) { + // Get overlap data + VariantContext wider; + VariantContext narrower; + final int length1 = v1.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0); + final int length2 = v2.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0); + if (length1 > length2) { + wider = v1; + narrower = v2; + } else if (length2 > length1) { + wider = v2; + narrower = v1; + } else { + return; + } + String widerID = wider.getID(); + String 
narrowerID = narrower.getID(); + + // Skip processing if same variant ID, SV type or samples + String widerSvType = wider.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); + String narrowerSvType = narrower.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); + Set widerSamples = getNonReferenceSamples(wider); + Set narrowerSamples = getNonReferenceSamples(narrower); + if (widerID.equals(narrowerID) || widerSvType.equals(narrowerSvType) || widerSamples.equals(narrowerSamples)) { + return; + } + + // Get samples present in wider but not in narrower + Set nonCommonSamples = new HashSet<>(widerSamples); + nonCommonSamples.removeAll(narrowerSamples); + if (nonCommonSamples.isEmpty()) { + return; + } + + // Revise variant if coverage exceeds threshold + double coverage = getCoverage(wider, narrower); + if (coverage >= 0.5) { + for (String sample : nonCommonSamples) { + revisedEventsAll.computeIfAbsent(narrowerID, k -> new HashMap<>()) + .put(sample, new ImmutablePair<>(widerID, widerSvType)); + } + } + } + + private void processCollectedVariants() { + // Prunes variant-sample pairs we need RD_CN values for + for (final Map.Entry>> entry : revisedEventsAll.entrySet()) { + for (final Map.Entry> innerEntry : entry.getValue().entrySet()) { + final String sampleName = innerEntry.getKey(); + final String variantId = entry.getKey(); + final String widerVariantId = innerEntry.getValue().getLeft(); + final String svType = innerEntry.getValue().getRight(); + if (svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DUP) || svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL)) { + revisedEventsFiltered.computeIfAbsent(variantId, k -> new HashSet<>()).add(sampleName); + revisedEventsFiltered.computeIfAbsent(widerVariantId, k -> new HashSet<>()).add(sampleName); + } + } + } + } + + private void processVariant(final VariantContextBuilder builder, final VariantContext variant) { + // Initialize data structures + final String variantId = variant.getID(); + final Map> variantEvents = 
revisedEventsAll.get(variantId); + final List newGenotypes = new ArrayList<>(); + + // Create updated genotypes + for (String sample : variant.getSampleNamesOrderedByName()) { + final Genotype oldGenotype = variant.getGenotype(sample); + final Pair event = variantEvents.get(sample); + + if (event != null) { + final String widerVariantId = event.getLeft(); + final String widerSvType = event.getRight(); + final int currentRdCn = revisedRdCn.get(variantId).getOrDefault(sample, 0); + final int widerRdCn = revisedRdCn.getOrDefault(widerVariantId, new HashMap<>()).getOrDefault(sample, 0); + + int newVal = -1; + if (widerSvType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DUP) && currentRdCn == 2 && widerRdCn == 3) { + newVal = 1; + } else if (widerSvType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL) && currentRdCn == 2 && widerRdCn == 1) { + newVal = 3; + } + + if (newVal != -1) { + final GenotypeBuilder gb = new GenotypeBuilder(oldGenotype); + gb.alleles(Arrays.asList(variant.getReference(), variant.getAlternateAllele(0))); + if (!oldGenotype.hasExtendedAttribute(GATKSVVCFConstants.RD_GQ)) continue; + + gb.GQ(Integer.parseInt((String) oldGenotype.getExtendedAttribute(GATKSVVCFConstants.RD_GQ))); + newGenotypes.add(gb.make()); + } else { + newGenotypes.add(oldGenotype); + } + } else { + newGenotypes.add(oldGenotype); + } + } + builder.genotypes(newGenotypes); + } + + private void processCnvs(final VariantContextBuilder builder, final VariantContext variant) { + final boolean isDel = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, "").equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL); + for (String sample : variant.getSampleNamesOrderedByName()) { + final Genotype genotype = variant.getGenotype(sample); + if (!genotype.hasExtendedAttribute(GATKSVVCFConstants.RD_CN)) continue; + + final String rdCnString = (String) genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN); + final int rdCn = Integer.parseInt(rdCnString); + if ((isDel && rdCn > 3) || (!isDel && (rdCn < 1 
|| rdCn > 4))) { + builder.attribute(GATKSVVCFConstants.MULTI_CNV, true); + break; + } + } + } + + private boolean isDelDup(final VariantContext variant) { + String svType = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); + return svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL) || svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DUP); + } + + private boolean isLarge(final VariantContext variant, final int minSize) { + int variantLength = Math.abs(variant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)); + return variantLength >= minSize; + } + + private boolean overlaps(final VariantContext v1, final VariantContext v2) { + return v1.getContig().equals(v2.getContig()) + && v1.getStart() <= (v2.getStart() + v2.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)) + && v2.getStart() <= (v1.getStart() + v1.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)); + } + + private Set getNonReferenceSamples(final VariantContext variant) { + Set samples = new HashSet<>(); + for (String sampleName : variant.getSampleNames()) { + Genotype genotype = variant.getGenotype(sampleName); + if (genotype.isCalled() && !genotype.isHomRef()) { + samples.add(sampleName); + } + } + return samples; + } + + private double getCoverage(final VariantContext wider, final VariantContext narrower) { + int nStart = narrower.getStart(); + int nStop = narrower.getEnd(); + int wStart = wider.getStart(); + int wStop = wider.getEnd(); + + if (wStart <= nStop && nStart <= wStop) { + int intersectionSize = Math.min(nStop, wStop) - Math.max(nStart, wStart) + 1; + return (double) intersectionSize / (nStop - nStart + 1); + } + return 0.0; + } +} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt2.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt2.java new file mode 100644 index 00000000000..6b852d44e79 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt2.java @@ -0,0 +1,337 @@ +package 
org.broadinstitute.hellbender.tools.walkers.sv; + +import htsjdk.variant.variantcontext.Genotype; +import htsjdk.variant.variantcontext.VariantContext; + +import org.broadinstitute.barclay.argparser.Argument; +import org.broadinstitute.barclay.argparser.BetaFeature; +import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; +import org.broadinstitute.barclay.help.DocumentedFeature; +import org.broadinstitute.hellbender.cmdline.programgroups.StructuralVariantDiscoveryProgramGroup; +import org.broadinstitute.hellbender.engine.*; +import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants; + +import java.io.BufferedWriter; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Paths; + +import java.util.Arrays; +import java.util.List; +import java.util.ArrayList; +import java.util.Set; +import java.util.Map; +import java.util.HashSet; +import java.util.HashMap; +import java.util.Collections; + +/** + * Completes an initial series of cleaning steps for a VCF produced by the GATK-SV pipeline. + * + *

Inputs

+ *
    + *
  • + * VCF containing structural variant (SV) records from the GATK-SV pipeline. + *
  • + *
  • + * TODO + *
  • + *
+ * + *

Output

+ *
    + *
  • + * Cleansed VCF. + *
  • + *
+ * + *

Usage Example

+ *
+ *     gatk SVCleanPt2 -V input.vcf.gz --sample-list samples.txt --output-revised-list revised_copy_numbers.txt
+ * 
+ * + *

Processing Steps

+ *
    + *
  1. + * Collects samples with abnormal RD_CN on large DEL/DUP calls and emits revised copy numbers for overlapping CNV pairs whose depth support indicates one call drives the other. + *
  2. + *
+ */ +@CommandLineProgramProperties( + summary = "Clean and format SV VCF", + oneLineSummary = "Clean and format SV VCF", + programGroup = StructuralVariantDiscoveryProgramGroup.class +) +@BetaFeature +@DocumentedFeature +public class SVCleanPt2 extends VariantWalker { + public static final String SAMPLE_LIST_LONG_NAME = "sample-list"; + public static final String OUTPUT_REVISED_LIST_LONG_NAME = "output-revised-list"; + + @Argument( + fullName = SAMPLE_LIST_LONG_NAME, + doc = "File with samples to include" + ) + private GATKPath sampleListPath; + + @Argument( + fullName = OUTPUT_REVISED_LIST_LONG_NAME, + doc = "Prefix for output files" + ) + private GATKPath outputRevisedList; + + private BufferedWriter revisedCnWriter; + + private Set sampleWhitelist; + + private final Map> abnormalRdCn = new HashMap<>(); + private final List overlappingVariantsBuffer = new ArrayList<>(); + private final Map> revisedCopyNumbers = new HashMap<>(); + private final Set revisedComplete = new HashSet<>(); + + private static final int MIN_VARIANT_SIZE = 5000; + + @Override + public void onTraversalStart() { + try { + revisedCnWriter = Files.newBufferedWriter(Paths.get(outputRevisedList.toString())); + + sampleWhitelist = new HashSet<>(Files.readAllLines(sampleListPath.toPath())); + } catch (IOException e) { + throw new RuntimeException("Error reading input file", e); + } + } + + @Override + public Object onTraversalSuccess() { + try { + List variantIDs = new ArrayList<>(revisedCopyNumbers.keySet()); + Collections.sort(variantIDs); + + for (String variantID : variantIDs) { + Map sampleMap = revisedCopyNumbers.get(variantID); + + List samples = new ArrayList<>(sampleMap.keySet()); + Collections.sort(samples); + + for (String sample : samples) { + int rdCn = sampleMap.get(sample); + revisedCnWriter.write(variantID + "\t" + sample + "\t" + rdCn); + revisedCnWriter.newLine(); + } + } + + if (revisedCnWriter != null) { + revisedCnWriter.close(); + } + + return null; + } catch (IOException e) 
{ + throw new RuntimeException("Error writing output file", e); + } + } + + @Override + public void apply(final VariantContext variant, final ReadsContext readsContext, ReferenceContext referenceContext, FeatureContext featureContext) { + // Skip if not expected SVTYPE or below SVLEN threshold + if (!isDelDup(variant) || !isLarge(variant, MIN_VARIANT_SIZE)) { + return; + } + + // Flag sample as having an abnormal copy number if it passes certain conditions + for (String sample : variant.getSampleNames()) { + Genotype genotype = variant.getGenotype(sample); + if (!genotype.hasExtendedAttribute(GATKSVVCFConstants.RD_CN)) continue; + + int rdCn = Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN).toString()); + if (!sampleWhitelist.contains(sample) || !genotype.isCalled() || rdCn == 2) { + continue; + } + + String svType = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); + if ((svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL) && rdCn < 2) || (svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DUP) && rdCn > 2)) { + abnormalRdCn.computeIfAbsent(variant.getID(), k -> new HashSet<>()).add(sample); + } + } + + // Process overlaps with variants in the buffer + overlappingVariantsBuffer.removeIf(vc -> !vc.getContig().equals(variant.getContig()) + || (vc.getStart() + vc.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)) < variant.getStart()); + for (VariantContext bufferedVariant : overlappingVariantsBuffer) { + if (overlaps(variant, bufferedVariant)) { + adjustCopyNumber(variant, bufferedVariant); + } + } + overlappingVariantsBuffer.add(variant); + } + + private void adjustCopyNumber(final VariantContext v1, final VariantContext v2) { + // Determine larger variant + VariantContext largerVariant = v1; + VariantContext smallerVariant = v2; + int length1 = v1.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0); + int length2 = v2.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0); + int largerLength = length1; + int smallerLength = length2; + + // Swap 
variants if necessary + if (length2 > length1) { + largerVariant = v2; + smallerVariant = v1; + largerLength = length2; + smallerLength = length1; + } + + // Get variant attributes + String variantId1 = largerVariant.getID(); + String variantId2 = smallerVariant.getID(); + Map variantRdCn1 = getRdCnForVariant(largerVariant); + Map variantRdCn2 = getRdCnForVariant(smallerVariant); + Map> variantSupport1 = getSupportForVariant(largerVariant); + Map> variantSupport2 = getSupportForVariant(smallerVariant); + String svType1 = largerVariant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); + String svType2 = smallerVariant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); + + // Calculate overlap + int minEnd = Math.min( + largerVariant.getStart() + largerVariant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0), + smallerVariant.getStart() + smallerVariant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0) + ); + int maxStart = Math.max(largerVariant.getStart(), smallerVariant.getStart()); + int lengthOverlap = minEnd - maxStart + 1; + double overlap1 = (double) lengthOverlap / (double) largerLength; + double overlap2 = (double) lengthOverlap / (double) smallerLength; + + // Get samples with abnormal CN across both variants + Set samples = new HashSet<>(abnormalRdCn.getOrDefault(variantId1, Collections.emptySet())); + samples.retainAll(abnormalRdCn.getOrDefault(variantId2, Collections.emptySet())); + + // Iterate through samples to test against conditions + for (String sample : samples) { + String id1 = variantId1 + "@" + sample; + String id2 = variantId2 + "@" + sample; + if (revisedComplete.contains(id1)) { + continue; + } + + // Initialize variables for evaluation + int rdCn1 = revisedCopyNumbers.getOrDefault(variantId1, Collections.emptyMap()).getOrDefault(sample, variantRdCn1.get(sample)); + int rdCn2 = revisedCopyNumbers.getOrDefault(variantId2, Collections.emptyMap()).getOrDefault(sample, variantRdCn2.get(sample)); + Set support1 = variantSupport1.get(sample); + 
Set support2 = variantSupport2.get(sample); + Genotype genotype2 = smallerVariant.getGenotype(sample); + + // Condition 1: Smaller depth call is being driven by larger call + if (support1.contains(GATKSVVCFConstants.EV_VALUES.get(1)) && support1.size() > 1 + && support2.equals(Collections.singleton(GATKSVVCFConstants.EV_VALUES.get(1))) + && overlap2 > 0.5 && !largerVariant.hasAttribute(GATKSVVCFConstants.MULTI_CNV)) { + if (rdCn1 == 0) { + makeRevision(id2, rdCn2 + 2); + } else if (rdCn1 == 1) { + makeRevision(id2, rdCn2 + rdCn1); + } else if (rdCn1 > 1) { + int newCN = rdCn2 - rdCn1 + 2; + newCN = Math.max(newCN, 0); + makeRevision(id2, newCN); + } + } + + // Condition 2: Smaller CNV is driven by larger CNV genotype + else if (support1.equals(Collections.singleton(GATKSVVCFConstants.EV_VALUES.get(1))) + && support2.contains(GATKSVVCFConstants.EV_VALUES.get(1)) && support2.size() > 1 + && overlap1 > 0.5 && overlap2 > 0.5 && !smallerVariant.hasAttribute(GATKSVVCFConstants.MULTI_CNV) + && !genotype2.isHomRef()) { + if (rdCn2 == 0) { + makeRevision(id1, rdCn1 + 2); + } else if (rdCn2 == 1) { + makeRevision(id1, rdCn1 + rdCn2); + } else if (rdCn2 > 1) { + int newCN = rdCn1 - rdCn2 + 2; + newCN = Math.max(newCN, 0); + makeRevision(id1, newCN); + } + } + + // Condition 3: Depth-only calls where smaller call is driven by larger call + else if (support1.equals(Collections.singleton(GATKSVVCFConstants.EV_VALUES.get(1))) + && support2.equals(Collections.singleton(GATKSVVCFConstants.EV_VALUES.get(1))) + && overlap2 > 0.5 && !largerVariant.hasAttribute(GATKSVVCFConstants.MULTI_CNV) && svType1.equals(svType2)) { + if (rdCn1 == 0 && rdCn1 != rdCn2) { + makeRevision(id2, rdCn2 + 2); + } else if (rdCn1 == 1 && rdCn1 > rdCn2) { + makeRevision(id2, 1); + } else if (rdCn1 > 1 && rdCn1 < rdCn2) { + int newCN = rdCn2 - rdCn1 + 2; + newCN = Math.max(newCN, 0); + makeRevision(id2, newCN); + } else { + makeRevision(id2, 2); + } + } + + // Condition 4: Any other time a larger call drives a 
smaller call + else if (support1.contains(GATKSVVCFConstants.EV_VALUES.get(1)) + && overlap2 > 0.5 && !largerVariant.hasAttribute(GATKSVVCFConstants.MULTI_CNV) && largerLength > MIN_VARIANT_SIZE) { + if (rdCn1 == 0) { + makeRevision(id2, rdCn2 + 2); + } else if (rdCn1 == 1) { + makeRevision(id2, rdCn2 + rdCn1); + } else if (rdCn1 > 1) { + int newCN = rdCn2 - rdCn1 + 2; + newCN = Math.max(newCN, 0); + makeRevision(id2, newCN); + } + } + } + } + + private boolean overlaps(final VariantContext v1, final VariantContext v2) { + return v1.getContig().equals(v2.getContig()) + && v1.getStart() <= (v2.getStart() + v2.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)) + && v2.getStart() <= (v1.getStart() + v1.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)); + } + + private boolean isDelDup(final VariantContext variant) { + String svType = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); + return svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL) || svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DUP); + } + + private boolean isLarge(final VariantContext variant, final int minSize) { + return Math.abs(variant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)) >= minSize; + } + + private Map> getSupportForVariant(final VariantContext variant) { + Map> supportMap = new HashMap<>(); + for (String sample : variant.getSampleNames()) { + Genotype genotype = variant.getGenotype(sample); + String supportStr = genotype.hasExtendedAttribute(GATKSVVCFConstants.EV) ? 
genotype.getExtendedAttribute(GATKSVVCFConstants.EV).toString() : ""; + Set supportSet = new HashSet<>(); + if (!supportStr.isEmpty()) { + supportSet.addAll(Arrays.asList(supportStr.split(","))); + } + supportMap.put(sample, supportSet); + } + return supportMap; + } + + private Map getRdCnForVariant(final VariantContext variant) { + Map rdCnMap = new HashMap<>(); + for (String sample : variant.getSampleNames()) { + Genotype genotype = variant.getGenotype(sample); + if (genotype.hasExtendedAttribute(GATKSVVCFConstants.RD_CN)) { + rdCnMap.put(sample, Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN).toString())); + } + } + return rdCnMap; + } + + private void makeRevision(final String id, final int val) { + String[] tokens = id.split("@"); + String variantId = tokens[0]; + String sample = tokens[1]; + revisedCopyNumbers.computeIfAbsent(variantId, k -> new HashMap<>()).put(sample, val); + if (val == 2) { + revisedComplete.add(id); + } + } +} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java new file mode 100644 index 00000000000..af96fd295df --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java @@ -0,0 +1,448 @@ +package org.broadinstitute.hellbender.tools.walkers.sv; + +import htsjdk.variant.variantcontext.Allele; + +import htsjdk.variant.variantcontext.Genotype; +import htsjdk.variant.variantcontext.GenotypeBuilder; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.variantcontext.VariantContextBuilder; +import htsjdk.variant.variantcontext.writer.VariantContextWriter; +import htsjdk.variant.vcf.VCFHeader; +import htsjdk.variant.vcf.VCFHeaderLineType; +import htsjdk.variant.vcf.VCFInfoHeaderLine; + +import htsjdk.variant.vcf.*; +import org.broadinstitute.barclay.argparser.Argument; +import org.broadinstitute.barclay.argparser.BetaFeature; +import 
org.broadinstitute.barclay.argparser.CommandLineProgramProperties; +import org.broadinstitute.barclay.help.DocumentedFeature; +import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; +import org.broadinstitute.hellbender.cmdline.programgroups.StructuralVariantDiscoveryProgramGroup; +import org.broadinstitute.hellbender.engine.*; +import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants; + +import java.io.BufferedReader; +import java.io.FileReader; +import java.io.IOException; +import java.nio.file.Files; + +import java.util.Arrays; +import java.util.List; +import java.util.ArrayList; +import java.util.Set; +import java.util.Map; +import java.util.HashSet; +import java.util.HashMap; + +/** + * Completes an initial series of cleaning steps for a VCF produced by the GATK-SV pipeline. + * + *

Inputs

+ *
    + *
  • + * VCF containing structural variant (SV) records from the GATK-SV pipeline. + *
  • + *
  • + * TODO + *
  • + *
+ * + *

Output

+ *
    + *
  • + * Cleansed VCF. + *
  • + *
+ * + *

Usage Example

+ *
+ *     TODO
+ * 
+ * + *

Processing Steps

+ *
    + *
  1. + * TODO + *
  2. + *
+ */ +@CommandLineProgramProperties( + summary = "SClean and format SV VCF", + oneLineSummary = "Clean and format SV VCF", + programGroup = StructuralVariantDiscoveryProgramGroup.class +) +@BetaFeature +@DocumentedFeature +public class SVCleanPt4 extends VariantWalker { + public static final String REVISED_CN_LIST_LONG_NAME = "revised-cn-list"; + public static final String OUTLIERS_LIST_LONG_NAME = "outliers-list"; + + @Argument( + fullName = REVISED_CN_LIST_LONG_NAME, + doc = "File with variant IDs, sample IDs, and RD_CN values" + ) + private GATKPath cnReviseList; + + @Argument( + fullName = OUTLIERS_LIST_LONG_NAME, + doc = "File with outlier samples", + optional = true + ) + private GATKPath outliersListPath; + + @Argument( + fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, + shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, + doc = "Output VCF name" + ) + private GATKPath outputVcf; + + private VariantContextWriter vcfWriter; + + private Map> revisedCopyNumbers; + private Set outlierSamples; + + private double recordStart; + private double recordEnd; + private long recordIdx; + private double maxVF; + + private static final int MIN_LARGE_EVENT_SIZE = 1000; + private static final int MIN_MULTIALLELIC_EVENT_SIZE = 5000; + + @Override + public void onTraversalStart() { + // Read and parse input files + try { + revisedCopyNumbers = readRevisedEvents(cnReviseList); + outlierSamples = new HashSet<>(); + if (outliersListPath != null) { + outlierSamples = new HashSet<>(Files.readAllLines(outliersListPath.toPath())); + } + } catch (IOException e) { + throw new RuntimeException("Error reading input file", e); + } + + // Parse batch-level metadata + String cnReviseListFileName = cnReviseList.toPath().getFileName().toString(); + String[] regenoFileNameTokens = cnReviseListFileName.split("\\."); + String[] batchTokens = regenoFileNameTokens[1].split("_"); + int batchNum = Math.max(Integer.parseInt(batchTokens[0]), 1); + int totalBatch = 
Math.max(Integer.parseInt(batchTokens[1]), 1); + long totalNumVariants = 0; + String inputVcfPath = getDrivingVariantsFeatureInput().getFeaturePath(); + try (FeatureDataSource dataSource = new FeatureDataSource<>(inputVcfPath)) { + for (VariantContext ignored : dataSource) { + totalNumVariants++; + } + } + double segments = totalNumVariants / (double) totalBatch; + recordStart = (batchNum - 1) * segments; + recordEnd = batchNum * segments; + maxVF = Math.max((getHeaderForVariants().getGenotypeSamples().size() - outlierSamples.size()) * 0.01, 2); + recordIdx = 0; + + // Filter specific header lines + final VCFHeader header = getHeaderForVariants(); + header.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.PESR_GT_OVERDISPERSION, 0, VCFHeaderLineType.Flag, "High PESR dispersion count")); + header.addMetaDataLine(new VCFFormatHeaderLine(GATKSVVCFConstants.COPY_NUMBER_FORMAT, 1, VCFHeaderLineType.Integer, "Predicted copy state")); + header.addMetaDataLine(new VCFFormatHeaderLine(GATKSVVCFConstants.COPY_NUMBER_QUALITY_FORMAT, 1, VCFHeaderLineType.Integer, "Read-depth genotype quality")); + header.addMetaDataLine(new VCFFilterHeaderLine(GATKSVVCFConstants.MULTIALLELIC, "Multiallelic site")); + + // Write header + vcfWriter = createVCFWriter(outputVcf); + vcfWriter.writeHeader(header); + } + + @Override + public void closeTool() { + if (vcfWriter != null) { + vcfWriter.close(); + } + } + + @Override + public void apply(final VariantContext variant, final ReadsContext readsContext, final ReferenceContext referenceContext, final FeatureContext featureContext) { + // Exit if outside batch range + if (recordIdx < recordStart || recordIdx >= recordEnd) { + recordIdx++; + return; + } + recordIdx++; + + // Initialize data structures + VariantContextBuilder builder = new VariantContextBuilder(variant); + List genotypes = variant.getGenotypes(); + + // Process variants + genotypes = processRevisedCn(variant, genotypes); + processMultiallelic(builder, genotypes); + 
genotypes = processLargeDeletions(variant, builder, genotypes); + genotypes = processLargeDuplications(variant, builder, genotypes); + genotypes = processRevisedSex(variant, genotypes); + + // Build genotypes + if (isCalled(builder, genotypes)) { + builder.genotypes(genotypes); + vcfWriter.add(builder.make()); + } + } + + private List processRevisedCn(final VariantContext variant, final List genotypes) { + final String variantID = variant.getID(); + if (!revisedCopyNumbers.containsKey(variantID)) { + return genotypes; + } + + List updatedGenotypes = new ArrayList<>(genotypes.size()); + for (Genotype genotype : genotypes) { + String sampleName = genotype.getSampleName(); + if (revisedCopyNumbers.get(variantID).containsKey(sampleName)) { + GenotypeBuilder gb = new GenotypeBuilder(genotype); + gb.alleles(Arrays.asList(variant.getReference(), variant.getAlternateAllele(0))); + gb.attribute(GATKSVVCFConstants.RD_CN, revisedCopyNumbers.get(variantID).get(sampleName)); + updatedGenotypes.add(gb.make()); + } else { + updatedGenotypes.add(genotype); + } + } + return updatedGenotypes; + } + + private void processMultiallelic(final VariantContextBuilder builder, final List genotypes) { + int numGtOver2 = 0; + for (Genotype genotype : genotypes) { + Integer peGt = genotype.hasExtendedAttribute(GATKSVVCFConstants.PE_GT) ? + Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.PE_GT).toString()) : null; + Integer srGt = genotype.hasExtendedAttribute(GATKSVVCFConstants.SR_GT) ? + Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.SR_GT).toString()) : null; + int gt; + if (peGt == null) { + continue; + } else if (srGt == null) { + gt = peGt; + } else if (peGt > 0 && srGt == 0) { + gt = peGt; + } else if (peGt == 0) { + gt = srGt; + } else { + Integer peGq = genotype.hasExtendedAttribute(GATKSVVCFConstants.PE_GQ) ? 
+ Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.PE_GQ).toString()) : null; + Integer srGq = genotype.hasExtendedAttribute(GATKSVVCFConstants.SR_GQ) ? + Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.SR_GQ).toString()) : null; + if (peGq != null && srGq != null && peGq >= srGq) { + gt = peGt; + } else { + gt = srGt; + } + } + if (gt > 2) { + numGtOver2++; + } + } + if (numGtOver2 > maxVF) { + builder.attribute(GATKSVVCFConstants.PESR_GT_OVERDISPERSION, true); + } + } + + private List processLargeDeletions(final VariantContext variant, final VariantContextBuilder builder, List genotypes) { + if (!variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, "").equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL)) { + return genotypes; + } + + boolean multiallelicFilter = false; + if (variant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0) >= MIN_LARGE_EVENT_SIZE) { + Map sampleRdCn = new HashMap<>(); + for (Genotype genotype : genotypes) { + if (!outlierSamples.contains(genotype.getSampleName()) && genotype.hasExtendedAttribute(GATKSVVCFConstants.RD_CN)) { + sampleRdCn.put(genotype.getSampleName(), Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN).toString())); + } + } + if (sampleRdCn.values().stream().filter(value -> value > 3).count() > maxVF) { + multiallelicFilter = true; + } + } + + boolean gt5kbFilter = false; + List allowedAlleleIndices = Arrays.asList(-1, 0, 1); + if (genotypes.stream().anyMatch(g -> g.getAlleles().stream().anyMatch(a -> !allowedAlleleIndices.contains(variant.getAlleleIndex(a))))) { + gt5kbFilter = true; + } else if (variant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0) >= MIN_MULTIALLELIC_EVENT_SIZE && !multiallelicFilter) { + gt5kbFilter = true; + } + + List updatedGenotypes = new ArrayList<>(genotypes.size()); + if (gt5kbFilter) { + for (Genotype genotype : genotypes) { + GenotypeBuilder gb = new GenotypeBuilder(genotype); + if (!genotype.isNoCall()) { + if (genotype.hasGQ() && 
Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN, 0).toString()) >= 2) { + gb.alleles(Arrays.asList(variant.getReference(), variant.getReference())); + } else if (genotype.hasGQ() && Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN, 0).toString()) == 1) { + gb.alleles(Arrays.asList(variant.getReference(), variant.getAlternateAllele(0))); + } else if (genotype.hasGQ()) { + gb.alleles(Arrays.asList(variant.getAlternateAllele(0), variant.getAlternateAllele(0))); + } + } + updatedGenotypes.add(gb.make()); + } + genotypes = updatedGenotypes; + } + + updatedGenotypes = new ArrayList<>(genotypes.size()); + if (multiallelicFilter) { + for (Genotype genotype : genotypes) { + GenotypeBuilder gb = new GenotypeBuilder(genotype); + gb.noGQ(); + gb.alleles(Arrays.asList(Allele.NO_CALL)); + gb.attribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT, genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN)); + gb.attribute(GATKSVVCFConstants.COPY_NUMBER_QUALITY_FORMAT, genotype.getExtendedAttribute(GATKSVVCFConstants.RD_GQ)); + updatedGenotypes.add(gb.make()); + } + genotypes = updatedGenotypes; + + builder.filter(GATKSVVCFConstants.MULTIALLELIC); + builder.attribute(GATKSVVCFConstants.SVTYPE, GATKSVVCFConstants.CNV); + builder.alleles(Arrays.asList(variant.getReference(), Allele.create("<" + GATKSVVCFConstants.CNV + ">", false))); + } + + return genotypes; + } + + private List processLargeDuplications(final VariantContext variant, final VariantContextBuilder builder, List genotypes) { + if (!variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, "").equals(GATKSVVCFConstants.SYMB_ALT_STRING_DUP)) { + return genotypes; + } + + boolean multiallelicFilter = false; + if (variant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0) >= MIN_LARGE_EVENT_SIZE) { + Map sampleRdCn = new HashMap<>(); + for (Genotype genotype : genotypes) { + if (!outlierSamples.contains(genotype.getSampleName()) && genotype.hasExtendedAttribute(GATKSVVCFConstants.RD_CN)) { + 
sampleRdCn.put(genotype.getSampleName(), Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN).toString())); + } + } + if (sampleRdCn.values().stream().filter(value -> value > 4).count() > maxVF) { + multiallelicFilter = true; + } + if (sampleRdCn.values().stream().filter(value -> (value < 1 || value > 4)).count() > 4) { + if (sampleRdCn.values().stream().filter(value -> (value < 1 || value > 4)).distinct().count() > maxVF) { + multiallelicFilter = true; + } + } + } + + boolean gt5kbFilter = false; + List allowedAlleleIndices = Arrays.asList(-1, 0, 1); + if (genotypes.stream().anyMatch(g -> g.getAlleles().stream().anyMatch(a -> !allowedAlleleIndices.contains(variant.getAlleleIndex(a))))) { + gt5kbFilter = true; + } else if (variant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0) >= MIN_MULTIALLELIC_EVENT_SIZE && !multiallelicFilter) { + gt5kbFilter = true; + } + + List updatedGenotypes = new ArrayList<>(genotypes.size()); + if (gt5kbFilter) { + for (Genotype genotype : genotypes) { + GenotypeBuilder gb = new GenotypeBuilder(genotype); + if (!genotype.isNoCall()) { + if (genotype.hasGQ() && Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN, 3).toString()) <= 2) { + gb.alleles(Arrays.asList(variant.getReference(), variant.getReference())); + } else if (genotype.hasGQ() && Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN, 0).toString()) == 3) { + gb.alleles(Arrays.asList(variant.getReference(), variant.getAlternateAllele(0))); + } else if (genotype.hasGQ()) { + gb.alleles(Arrays.asList(variant.getAlternateAllele(0), variant.getAlternateAllele(0))); + } + } + updatedGenotypes.add(gb.make()); + } + genotypes = updatedGenotypes; + } + + updatedGenotypes = new ArrayList<>(genotypes.size()); + if (multiallelicFilter) { + for (Genotype genotype : genotypes) { + GenotypeBuilder gb = new GenotypeBuilder(genotype); + gb.noGQ(); + gb.alleles(Arrays.asList(Allele.NO_CALL)); + 
gb.attribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT, genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN)); + gb.attribute(GATKSVVCFConstants.COPY_NUMBER_QUALITY_FORMAT, genotype.getExtendedAttribute(GATKSVVCFConstants.RD_GQ)); + updatedGenotypes.add(gb.make()); + } + genotypes = updatedGenotypes; + + builder.filter(GATKSVVCFConstants.MULTIALLELIC); + builder.attribute(GATKSVVCFConstants.SVTYPE, GATKSVVCFConstants.CNV); + builder.alleles(Arrays.asList(variant.getReference(), Allele.create("<" + GATKSVVCFConstants.CNV + ">", false))); + } + + return genotypes; + } + + private List processRevisedSex(final VariantContext variant, List genotypes) { + if (!variant.getAttributeAsBoolean(GATKSVVCFConstants.REVISED_EVENT, false)) { + return genotypes; + } + + List updatedGenotypes = new ArrayList<>(genotypes.size()); + for (Genotype genotype : genotypes) { + if (Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN, 0).toString()) > 0) { + int newRdCn = Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN).toString()) - 1; + GenotypeBuilder gb = new GenotypeBuilder(genotype); + gb.attribute(GATKSVVCFConstants.RD_CN, newRdCn); + if (genotype.hasExtendedAttribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT)) { + gb.attribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT, newRdCn); + } + updatedGenotypes.add(gb.make()); + } else { + updatedGenotypes.add(genotype); + } + } + return updatedGenotypes; + } + + public boolean isCalled(final VariantContextBuilder builder, final List genotypes) { + for (Genotype genotype : genotypes) { + if (!isNoCallGt(genotype.getAlleles())) { + return true; + } + } + + if (builder.getAttributes().getOrDefault(GATKSVVCFConstants.SVTYPE, "").equals(GATKSVVCFConstants.CNV)) { + for (Genotype genotype : genotypes) { + final int cn = Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT, 2).toString()); + if (cn != 2) { + return true; + } + } + } + + return false; + } + + private boolean 
isNoCallGt(List alleles) { + if (alleles.size() == 1 && alleles.get(0).isReference()) return true; + else if (alleles.size() == 2 && alleles.get(0).isReference() && alleles.get(1).isReference()) return true; + else if (alleles.size() == 1 && alleles.get(0).isNoCall()) return true; + return false; + } + + private Map> readRevisedEvents(final GATKPath filePath) { + try (BufferedReader reader = new BufferedReader(new FileReader(filePath.toPath().toFile()))) { + final Map> result = new HashMap<>(); + String line; + while ((line = reader.readLine()) != null) { + String[] fields = line.split("\t"); + if (fields.length < 3) continue; + + String variantId = fields[0]; + String sampleId = fields[1]; + int rdCn = Integer.parseInt(fields[2]); + + result.computeIfAbsent(variantId, k -> new HashMap<>()).put(sampleId, rdCn); + } + return result; + } catch (IOException e) { + throw new RuntimeException("Error reading input file", e); + } + } +} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt5.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt5.java new file mode 100644 index 00000000000..92871c5f168 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt5.java @@ -0,0 +1,243 @@ +package org.broadinstitute.hellbender.tools.walkers.sv; + +import htsjdk.variant.variantcontext.Allele; +import htsjdk.variant.variantcontext.Genotype; +import htsjdk.variant.variantcontext.GenotypeBuilder; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.variantcontext.VariantContextBuilder; +import htsjdk.variant.variantcontext.writer.VariantContextWriter; + +import htsjdk.variant.vcf.*; +import org.broadinstitute.barclay.argparser.Argument; +import org.broadinstitute.barclay.argparser.BetaFeature; +import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; +import org.broadinstitute.barclay.help.DocumentedFeature; +import 
org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; +import org.broadinstitute.hellbender.cmdline.programgroups.StructuralVariantDiscoveryProgramGroup; +import org.broadinstitute.hellbender.engine.*; +import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants; +import org.broadinstitute.hellbender.utils.variant.GATKSVVariantContextUtils; + +import java.util.Arrays; +import java.util.List; +import java.util.ArrayList; +import java.util.Set; +import java.util.Map; +import java.util.HashSet; + +/** + * Completes an initial series of cleaning steps for a VCF produced by the GATK-SV pipeline. + * + *

Inputs

+ *
    + *
  • + * VCF containing structural variant (SV) records from the GATK-SV pipeline. + *
  • + *
  • + * TODO + *
  • + *
+ * + *

Output

+ *
    + *
  • + * Cleansed VCF. + *
  • + *
+ * + *

Usage Example

+ *
+ *     TODO
+ * 
+ * + *

Processing Steps

+ *
    + *
  1. + * TODO + *
  2. + *
+ */ +@CommandLineProgramProperties( + summary = "Clean and format SV VCF", + oneLineSummary = "Clean and format SV VCF", + programGroup = StructuralVariantDiscoveryProgramGroup.class +) +@BetaFeature +@DocumentedFeature +public class SVCleanPt5 extends MultiplePassVariantWalker { + @Argument( + fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, + shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, + doc = "Output VCF name" + ) + private GATKPath outputVcf; + + private VariantContextWriter vcfWriter; + + private final List overlappingVariantsBuffer = new ArrayList<>(); + private final Set filteredVariantIds = new HashSet<>(); + + @Override + protected int numberOfPasses() { return 2; } + + @Override + public void onTraversalStart() { + // Remove unnecessary header lines + final Set newHeaderLines = new HashSet<>(); + final VCFHeader header = getHeaderForVariants(); + for (final VCFHeaderLine line : header.getMetaDataInInputOrder()) { + if (line instanceof VCFInfoHeaderLine) { + if (GATKSVVCFConstants.FILTER_VCF_INFO_LINES.contains(((VCFInfoHeaderLine) line).getID())) { + continue; + } + } else if (line instanceof VCFFormatHeaderLine) { + if (((VCFFormatHeaderLine) line).getID().equals(GATKSVVCFConstants.EV)) { + continue; + } + } else if (line instanceof VCFAltHeaderLine) { + if (((VCFAltHeaderLine) line).getID().equals(GATKSVVCFConstants.UNR)) { + continue; + } + } + if (GATKSVVCFConstants.FILTER_VCF_LINES.stream().anyMatch(line.toString()::contains)) { + continue; + } + newHeaderLines.add(line); + } + + // Add new header lines + VCFHeader newHeader = new VCFHeader(newHeaderLines, header.getGenotypeSamples()); + newHeader.addMetaDataLine(new VCFFormatHeaderLine(GATKSVVCFConstants.EV, 1, VCFHeaderLineType.String, "Classes of evidence supporting final genotype")); + + // Write header + vcfWriter = createVCFWriter(outputVcf); + vcfWriter.writeHeader(newHeader); + } + + @Override + public void closeTool() { + if (vcfWriter != null) { + vcfWriter.close(); 
+ } + } + + @Override + protected void nthPassApply(final VariantContext variant, final ReadsContext readsContext, final ReferenceContext referenceContext, final FeatureContext featureContext, int n) { + switch (n) { + case 0: + firstPassApply(variant); + break; + case 1: + secondPassApply(variant); + break; + } + } + + @Override + protected void afterNthPass(int n) {} + + public void firstPassApply(final VariantContext variant) { + if (!variant.getFilters().contains(GATKSVVCFConstants.MULTIALLELIC)) { + return; + } + + overlappingVariantsBuffer.removeIf(vc -> !vc.getContig().equals(variant.getContig()) + || (vc.getStart() + vc.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)) < variant.getStart()); + for (VariantContext bufferedVariant : overlappingVariantsBuffer) { + if (overlaps(bufferedVariant, variant)) { + processVariantPair(bufferedVariant, variant); + } + } + overlappingVariantsBuffer.add(variant); + } + + public void secondPassApply(final VariantContext variant) { + if (filteredVariantIds.contains(variant.getID())) { + return; + } + + VariantContextBuilder builder = new VariantContextBuilder(variant); + processSvType(variant, builder); + cleanseInfoFields(builder); + vcfWriter.add(builder.make()); + } + + private void processVariantPair(VariantContext v1, VariantContext v2) { + // Determine larger variant + VariantContext largerVariant = v1; + VariantContext smallerVariant = v2; + int length1 = v1.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0); + int length2 = v2.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0); + int smallerLength = length2; + + // Swap variants if necessary + if (length2 > length1) { + largerVariant = v2; + smallerVariant = v1; + smallerLength = length1; + } + + // Calculate overlap + int minEnd = Math.min( + largerVariant.getStart() + largerVariant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0), + smallerVariant.getStart() + smallerVariant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0) + ); + int maxStart = 
Math.max(largerVariant.getStart(), smallerVariant.getStart()); + int overlapLength = minEnd - maxStart + 1; + if (overlapLength <= 0) { + return; + } + + // Filter variant based on conditions + double coverage = (double) overlapLength / smallerLength; + if (coverage > 0.5 && !filteredVariantIds.contains(largerVariant.getID())) { + filteredVariantIds.add(smallerVariant.getID()); + } + } + + private void processSvType(final VariantContext variant, final VariantContextBuilder builder) { + final String svType = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, null); + boolean hasMobileElement = variant.getAlleles().stream() + .map(GATKSVVariantContextUtils::getSymbolicAlleleSymbols) + .flatMap(Arrays::stream) + .anyMatch(symbol -> symbol.equals(GATKSVVCFConstants.ME)); + if (svType == null || hasMobileElement) { + return; + } + + final Allele refAllele = variant.getReference(); + final Allele altAllele = Allele.create("<" + svType + ">", false); + List newAlleles = Arrays.asList(refAllele, altAllele); + + List genotypes = variant.getGenotypes(); + List updatedGenotypes = new ArrayList<>(genotypes.size()); + for (Genotype genotype : genotypes) { + GenotypeBuilder gb = new GenotypeBuilder(genotype); + long altCount = genotype.getAlleles().stream().filter(allele -> allele.isCalled() && !allele.isReference()).count(); + if (altCount == 1) { // Heterozygous (0/1) + gb.alleles(Arrays.asList(refAllele, altAllele)); + } else if (altCount == 2) { // Homozygous Alternate (1/1) + gb.alleles(Arrays.asList(altAllele, altAllele)); + } + updatedGenotypes.add(gb.make()); + } + + builder.alleles(newAlleles); + builder.genotypes(updatedGenotypes); + } + + private void cleanseInfoFields(final VariantContextBuilder builder) { + Map attributes = builder.getAttributes(); + for (String field : GATKSVVCFConstants.FILTER_VCF_INFO_LINES) { + if (attributes.containsKey(field)) { + builder.rmAttribute(field); + } + } + } + + private boolean overlaps(final VariantContext v1, final 
VariantContext v2) { + return v1.getContig().equals(v2.getContig()) + && v1.getStart() <= (v2.getStart() + v2.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)) + && v2.getStart() <= (v1.getStart() + v1.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)); + } +} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseAbnormalAllosomes.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseAbnormalAllosomes.java new file mode 100644 index 00000000000..fecb8860445 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseAbnormalAllosomes.java @@ -0,0 +1,111 @@ +package org.broadinstitute.hellbender.tools.walkers.sv; + +import htsjdk.variant.variantcontext.Genotype; +import htsjdk.variant.variantcontext.GenotypeBuilder; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.variantcontext.VariantContextBuilder; +import htsjdk.variant.variantcontext.writer.VariantContextWriter; + +import org.broadinstitute.barclay.argparser.Argument; +import org.broadinstitute.barclay.argparser.BetaFeature; +import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; +import org.broadinstitute.barclay.help.DocumentedFeature; +import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; +import org.broadinstitute.hellbender.cmdline.programgroups.StructuralVariantDiscoveryProgramGroup; +import org.broadinstitute.hellbender.engine.*; +import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants; + +import java.util.List; +import java.util.ArrayList; + +/** + * Completes an initial series of cleaning steps for a VCF produced by the GATK-SV pipeline. + * + *

Inputs

+ *
    + *
  • + * VCF containing structural variant (SV) records from the GATK-SV pipeline. + *
  • + *
  • + * TODO + *
  • + *
+ * + *

Output

+ *
    + *
  • + * Cleansed VCF. + *
  • + *
+ * + *

Usage Example

+ *
+ *     TODO
+ * 
+ * + *

Processing Steps

+ *
    + *
  1. + * TODO + *
  2. + *
+ */ +@CommandLineProgramProperties( + summary = "Clean and format SV VCF", + oneLineSummary = "Clean and format SV VCF", + programGroup = StructuralVariantDiscoveryProgramGroup.class +) +@BetaFeature +@DocumentedFeature +public class SVReviseAbnormalAllosomes extends VariantWalker { + @Argument( + fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, + shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, + doc = "Output VCF name" + ) + private GATKPath outputVcf; + + private VariantContextWriter vcfWriter; + + @Override + public void onTraversalStart() { + vcfWriter = createVCFWriter(outputVcf); + vcfWriter.writeHeader(getHeaderForVariants()); + } + + @Override + public void closeTool() { + if (vcfWriter != null) { + vcfWriter.close(); + } + } + + @Override + public void apply(final VariantContext variant, final ReadsContext readsContext, + final ReferenceContext referenceContext, final FeatureContext featureContext) { + final VariantContextBuilder builder = new VariantContextBuilder(variant); + if (variant.getAttributeAsBoolean(GATKSVVCFConstants.REVISED_EVENT, false)) { + processRevisedSex(variant, builder); + } + vcfWriter.add(builder.make()); + } + + private void processRevisedSex(final VariantContext variant, final VariantContextBuilder builder) { + final List genotypes = variant.getGenotypes(); + final List updatedGenotypes = new ArrayList<>(genotypes.size()); + for (final Genotype genotype : genotypes) { + if (Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN, 0).toString()) > 0) { + int newRdCn = Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN).toString()) - 1; + GenotypeBuilder gb = new GenotypeBuilder(genotype); + gb.attribute(GATKSVVCFConstants.RD_CN, newRdCn); + if (genotype.hasExtendedAttribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT)) { + gb.attribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT, newRdCn); + } + updatedGenotypes.add(gb.make()); + } else { + updatedGenotypes.add(genotype); + } + } + 
builder.genotypes(genotypes); + } +} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseLargeCnvs.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseLargeCnvs.java new file mode 100644 index 00000000000..4e4c83652e7 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseLargeCnvs.java @@ -0,0 +1,347 @@ +package org.broadinstitute.hellbender.tools.walkers.sv; + +import htsjdk.variant.variantcontext.Allele; +import htsjdk.variant.variantcontext.Genotype; +import htsjdk.variant.variantcontext.GenotypeBuilder; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.variantcontext.VariantContextBuilder; +import htsjdk.variant.variantcontext.writer.VariantContextWriter; +import htsjdk.variant.vcf.VCFHeader; +import htsjdk.variant.vcf.VCFHeaderLineType; +import htsjdk.variant.vcf.VCFInfoHeaderLine; +import htsjdk.variant.vcf.VCFFormatHeaderLine; +import htsjdk.variant.vcf.VCFFilterHeaderLine; + +import org.broadinstitute.barclay.argparser.Argument; +import org.broadinstitute.barclay.argparser.BetaFeature; +import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; +import org.broadinstitute.barclay.help.DocumentedFeature; +import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; +import org.broadinstitute.hellbender.cmdline.programgroups.StructuralVariantDiscoveryProgramGroup; +import org.broadinstitute.hellbender.engine.*; +import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants; + +import java.io.IOException; +import java.nio.file.Files; + +import java.util.Arrays; +import java.util.List; +import java.util.ArrayList; +import java.util.Set; +import java.util.Map; +import java.util.HashSet; +import java.util.HashMap; + +/** + * Completes an initial series of cleaning steps for a VCF produced by the GATK-SV pipeline. + * + *

Inputs

+ *
    + *
  • + * VCF containing structural variant (SV) records from the GATK-SV pipeline. + *
  • + *
  • + * TODO + *
  • + *
+ * + *

Output

+ *
    + *
  • + * Cleansed VCF. + *
  • + *
+ * + *

Usage Example

+ *
+ *     TODO
+ * 
+ * + *

Processing Steps

+ *
    + *
  1. + * TODO + *
  2. + *
+ */ +@CommandLineProgramProperties( + summary = "Clean and format SV VCF", + oneLineSummary = "Clean and format SV VCF", + programGroup = StructuralVariantDiscoveryProgramGroup.class +) +@BetaFeature +@DocumentedFeature +public class SVReviseLargeCnvs extends VariantWalker { + public static final String OUTLIERS_LIST_LONG_NAME = "outliers-list"; + + @Argument( + fullName = OUTLIERS_LIST_LONG_NAME, + doc = "File with outlier samples", + optional = true + ) + private GATKPath outliersListPath; + + @Argument( + fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, + shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, + doc = "Output VCF name" + ) + private GATKPath outputVcf; + + private VariantContextWriter vcfWriter; + + private Set outlierSamples; + + private double maxVF; + + private static final int MIN_LARGE_EVENT_SIZE = 1000; + private static final int MIN_MULTIALLELIC_EVENT_SIZE = 5000; + + @Override + public void onTraversalStart() { + // Read and parse input files + try { + outlierSamples = new HashSet<>(); + if (outliersListPath != null) { + outlierSamples = new HashSet<>(Files.readAllLines(outliersListPath.toPath())); + } + } catch (IOException e) { + throw new RuntimeException("Error reading input file", e); + } + + // Populate maxVf based on sample information + maxVF = Math.max((getHeaderForVariants().getGenotypeSamples().size() - outlierSamples.size()) * 0.01, 2); + + // Filter specific header lines + final VCFHeader header = getHeaderForVariants(); + header.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.PESR_GT_OVERDISPERSION, 0, VCFHeaderLineType.Flag, "High PESR dispersion count")); + header.addMetaDataLine(new VCFFormatHeaderLine(GATKSVVCFConstants.COPY_NUMBER_FORMAT, 1, VCFHeaderLineType.Integer, "Predicted copy state")); + header.addMetaDataLine(new VCFFormatHeaderLine(GATKSVVCFConstants.COPY_NUMBER_QUALITY_FORMAT, 1, VCFHeaderLineType.Integer, "Read-depth genotype quality")); + header.addMetaDataLine(new 
VCFFilterHeaderLine(GATKSVVCFConstants.MULTIALLELIC, "Multiallelic site")); + + // Write header + vcfWriter = createVCFWriter(outputVcf); + vcfWriter.writeHeader(header); + } + + @Override + public void closeTool() { + if (vcfWriter != null) { + vcfWriter.close(); + } + } + + @Override + public void apply(final VariantContext variant, final ReadsContext readsContext, final ReferenceContext referenceContext, final FeatureContext featureContext) { + // Initialize data structures + final VariantContextBuilder builder = new VariantContextBuilder(variant); + List genotypes = variant.getGenotypes(); + + // Process variants + processMultiallelic(builder, genotypes); + genotypes = processLargeDeletions(variant, builder, genotypes); + genotypes = processLargeDuplications(variant, builder, genotypes); + + // Build genotypes + if (isCalled(builder, genotypes)) { + builder.genotypes(genotypes); + vcfWriter.add(builder.make()); + } + } + + private void processMultiallelic(final VariantContextBuilder builder, final List genotypes) { + int numGtOver2 = 0; + for (final Genotype genotype : genotypes) { + final Integer peGt = genotype.hasExtendedAttribute(GATKSVVCFConstants.PE_GT) ? + Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.PE_GT).toString()) : null; + final Integer srGt = genotype.hasExtendedAttribute(GATKSVVCFConstants.SR_GT) ? + Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.SR_GT).toString()) : null; + int gt; + if (peGt == null) { + continue; + } else if (srGt == null) { + gt = peGt; + } else if (peGt > 0 && srGt == 0) { + gt = peGt; + } else if (peGt == 0) { + gt = srGt; + } else { + final Integer peGq = genotype.hasExtendedAttribute(GATKSVVCFConstants.PE_GQ) ? + Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.PE_GQ).toString()) : null; + final Integer srGq = genotype.hasExtendedAttribute(GATKSVVCFConstants.SR_GQ) ? 
+ Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.SR_GQ).toString()) : null; + if (peGq != null && srGq != null && peGq >= srGq) { + gt = peGt; + } else { + gt = srGt; + } + } + if (gt > 2) { + numGtOver2 += 1; + } + } + if (numGtOver2 > maxVF) { + builder.attribute(GATKSVVCFConstants.PESR_GT_OVERDISPERSION, true); + } + } + + private List processLargeDeletions(final VariantContext variant, final VariantContextBuilder builder, List genotypes) { + if (!variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, "").equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL)) { + return genotypes; + } + + boolean multiallelicFilter = false; + if (variant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0) >= MIN_LARGE_EVENT_SIZE) { + Map sampleRdCn = new HashMap<>(); + for (final Genotype genotype : genotypes) { + final String sample = genotype.getSampleName(); + if (!outlierSamples.contains(sample) && genotype.hasExtendedAttribute(GATKSVVCFConstants.RD_CN)) { + sampleRdCn.put(sample, Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN).toString())); + } + } + if (sampleRdCn.values().stream().filter(value -> value > 3).count() > maxVF) { + multiallelicFilter = true; + } + } + + boolean gt5kbFilter = false; + final List allowedAlleleIndices = Arrays.asList(-1, 0, 1); + if (genotypes.stream().anyMatch(g -> g.getAlleles().stream().anyMatch(a -> !allowedAlleleIndices.contains(variant.getAlleleIndex(a))))) { + gt5kbFilter = true; + } else if (variant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0) >= MIN_MULTIALLELIC_EVENT_SIZE && !multiallelicFilter) { + gt5kbFilter = true; + } + + List updatedGenotypes = new ArrayList<>(genotypes.size()); + if (gt5kbFilter) { + for (final Genotype genotype : genotypes) { + final GenotypeBuilder gb = new GenotypeBuilder(genotype); + if (!genotype.isNoCall()) { + if (genotype.hasGQ() && Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN, 0).toString()) >= 2) { + 
gb.alleles(Arrays.asList(variant.getReference(), variant.getReference())); + } else if (genotype.hasGQ() && Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN, 0).toString()) == 1) { + gb.alleles(Arrays.asList(variant.getReference(), variant.getAlternateAllele(0))); + } else if (genotype.hasGQ()) { + gb.alleles(Arrays.asList(variant.getAlternateAllele(0), variant.getAlternateAllele(0))); + } + } + updatedGenotypes.add(gb.make()); + } + genotypes = updatedGenotypes; + } + + updatedGenotypes = new ArrayList<>(genotypes.size()); + if (multiallelicFilter) { + for (final Genotype genotype : genotypes) { + GenotypeBuilder gb = new GenotypeBuilder(genotype); + gb.noGQ(); + gb.alleles(Arrays.asList(Allele.NO_CALL)); + gb.attribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT, genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN)); + gb.attribute(GATKSVVCFConstants.COPY_NUMBER_QUALITY_FORMAT, genotype.getExtendedAttribute(GATKSVVCFConstants.RD_GQ)); + updatedGenotypes.add(gb.make()); + } + genotypes = updatedGenotypes; + + builder.filter(GATKSVVCFConstants.MULTIALLELIC); + builder.attribute(GATKSVVCFConstants.SVTYPE, GATKSVVCFConstants.CNV); + builder.alleles(Arrays.asList(variant.getReference(), Allele.create("<" + GATKSVVCFConstants.CNV + ">", false))); + } + + return genotypes; + } + + private List processLargeDuplications(final VariantContext variant, final VariantContextBuilder builder, List genotypes) { + if (!variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, "").equals(GATKSVVCFConstants.SYMB_ALT_STRING_DUP)) { + return genotypes; + } + + boolean multiallelicFilter = false; + if (variant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0) >= MIN_LARGE_EVENT_SIZE) { + Map sampleRdCn = new HashMap<>(); + for (final Genotype genotype : genotypes) { + final String sample = genotype.getSampleName(); + if (!outlierSamples.contains(sample) && genotype.hasExtendedAttribute(GATKSVVCFConstants.RD_CN)) { + sampleRdCn.put(sample, 
Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN).toString())); + } + } + if (sampleRdCn.values().stream().filter(value -> value > 4).count() > maxVF) { + multiallelicFilter = true; + } + if (sampleRdCn.values().stream().filter(value -> (value < 1 || value > 4)).count() > 4) { + if (sampleRdCn.values().stream().filter(value -> (value < 1 || value > 4)).distinct().count() > maxVF) { + multiallelicFilter = true; + } + } + } + + boolean gt5kbFilter = false; + final List allowedAlleleIndices = Arrays.asList(-1, 0, 1); + if (genotypes.stream().anyMatch(g -> g.getAlleles().stream().anyMatch(a -> !allowedAlleleIndices.contains(variant.getAlleleIndex(a))))) { + gt5kbFilter = true; + } else if (variant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0) >= MIN_MULTIALLELIC_EVENT_SIZE && !multiallelicFilter) { + gt5kbFilter = true; + } + + List updatedGenotypes = new ArrayList<>(genotypes.size()); + if (gt5kbFilter) { + for (final Genotype genotype : genotypes) { + final GenotypeBuilder gb = new GenotypeBuilder(genotype); + if (!genotype.isNoCall()) { + if (genotype.hasGQ() && Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN, 3).toString()) <= 2) { + gb.alleles(Arrays.asList(variant.getReference(), variant.getReference())); + } else if (genotype.hasGQ() && Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN, 0).toString()) == 3) { + gb.alleles(Arrays.asList(variant.getReference(), variant.getAlternateAllele(0))); + } else if (genotype.hasGQ()) { + gb.alleles(Arrays.asList(variant.getAlternateAllele(0), variant.getAlternateAllele(0))); + } + } + updatedGenotypes.add(gb.make()); + } + genotypes = updatedGenotypes; + } + + updatedGenotypes = new ArrayList<>(genotypes.size()); + if (multiallelicFilter) { + for (final Genotype genotype : genotypes) { + final GenotypeBuilder gb = new GenotypeBuilder(genotype); + gb.noGQ(); + gb.alleles(Arrays.asList(Allele.NO_CALL)); + gb.attribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT, 
genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN)); + gb.attribute(GATKSVVCFConstants.COPY_NUMBER_QUALITY_FORMAT, genotype.getExtendedAttribute(GATKSVVCFConstants.RD_GQ)); + updatedGenotypes.add(gb.make()); + } + genotypes = updatedGenotypes; + + builder.filter(GATKSVVCFConstants.MULTIALLELIC); + builder.attribute(GATKSVVCFConstants.SVTYPE, GATKSVVCFConstants.CNV); + builder.alleles(Arrays.asList(variant.getReference(), Allele.create("<" + GATKSVVCFConstants.CNV + ">", false))); + } + + return genotypes; + } + + public boolean isCalled(final VariantContextBuilder builder, final List genotypes) { + for (final Genotype genotype : genotypes) { + if (!isNoCallGt(genotype.getAlleles())) { + return true; + } + } + + if (builder.getAttributes().getOrDefault(GATKSVVCFConstants.SVTYPE, "").equals(GATKSVVCFConstants.CNV)) { + for (final Genotype genotype : genotypes) { + if (Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT, 2).toString()) != 2) { + return true; + } + } + } + + return false; + } + + private boolean isNoCallGt(final List alleles) { + if (alleles.size() == 1 && alleles.get(0).isReference()) return true; + else if (alleles.size() == 2 && alleles.get(0).isReference() && alleles.get(1).isReference()) return true; + else if (alleles.size() == 1 && alleles.get(0).isNoCall()) return true; + return false; + } +} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseMultiallelicCnvs.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseMultiallelicCnvs.java new file mode 100644 index 00000000000..25066ab0ec6 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseMultiallelicCnvs.java @@ -0,0 +1,170 @@ +package org.broadinstitute.hellbender.tools.walkers.sv; + +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.variantcontext.VariantContextBuilder; +import htsjdk.variant.variantcontext.writer.VariantContextWriter; + 
+import org.broadinstitute.barclay.argparser.Argument; +import org.broadinstitute.barclay.argparser.BetaFeature; +import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; +import org.broadinstitute.barclay.help.DocumentedFeature; +import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; +import org.broadinstitute.hellbender.cmdline.programgroups.StructuralVariantDiscoveryProgramGroup; +import org.broadinstitute.hellbender.engine.*; +import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants; + +import java.util.List; +import java.util.ArrayList; +import java.util.Set; +import java.util.HashSet; + +/** + * Completes an initial series of cleaning steps for a VCF produced by the GATK-SV pipeline. + * + *

Inputs

+ *
    + *
  • + * VCF containing structural variant (SV) records from the GATK-SV pipeline. + *
  • + *
  • + * TODO + *
  • + *
+ * + *

Output

+ *
    + *
  • + * Cleansed VCF. + *
  • + *
+ * + *

Usage Example

+ *
+ *     TODO
+ * 
+ * + *

Processing Steps

+ *
    + *
  1. + * TODO + *
  2. + *
+ */ +@CommandLineProgramProperties( + summary = "Clean and format SV VCF", + oneLineSummary = "Clean and format SV VCF", + programGroup = StructuralVariantDiscoveryProgramGroup.class +) +@BetaFeature +@DocumentedFeature +public class SVReviseMultiallelicCnvs extends MultiplePassVariantWalker { + @Argument( + fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, + shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, + doc = "Output VCF name" + ) + private GATKPath outputVcf; + + private VariantContextWriter vcfWriter; + + private final List overlappingVariantsBuffer = new ArrayList<>(); + private final Set filteredVariantIds = new HashSet<>(); + + @Override + protected int numberOfPasses() { return 2; } + + @Override + protected void afterNthPass(int n) {} + + @Override + public void onTraversalStart() { + vcfWriter = createVCFWriter(outputVcf); + vcfWriter.writeHeader(getHeaderForVariants()); + } + + @Override + public void closeTool() { + if (vcfWriter != null) { + vcfWriter.close(); + } + } + + @Override + protected void nthPassApply(final VariantContext variant, final ReadsContext readsContext, + final ReferenceContext referenceContext, final FeatureContext featureContext, int n) { + switch (n) { + case 0: + firstPassApply(variant); + break; + case 1: + secondPassApply(variant); + break; + } + } + + public void firstPassApply(final VariantContext variant) { + if (!variant.getFilters().contains(GATKSVVCFConstants.MULTIALLELIC)) { + return; + } + + overlappingVariantsBuffer.removeIf(vc -> !vc.getContig().equals(variant.getContig()) + || (vc.getStart() + vc.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)) < variant.getStart()); + for (final VariantContext bufferedVariant : overlappingVariantsBuffer) { + if (overlaps(bufferedVariant, variant)) { + processVariantPair(bufferedVariant, variant); + } + } + overlappingVariantsBuffer.add(variant); + } + + public void secondPassApply(final VariantContext variant) { + if 
(filteredVariantIds.contains(variant.getID())) { + return; + } + + final VariantContextBuilder builder = new VariantContextBuilder(variant); + vcfWriter.add(builder.make()); + } + + private void processVariantPair(final VariantContext v1, final VariantContext v2) { + // Determine larger variant, swapping if necessary + VariantContext largerVariant = v1; + VariantContext smallerVariant = v2; + final int length1 = v1.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0); + final int length2 = v2.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0); + if (length2 > length1) { + largerVariant = v2; + smallerVariant = v1; + } + + // Skip if coverage below expected + final double coverage = getCoverage(largerVariant, smallerVariant); + if (coverage < 0.5) { + return; + } + + // Filter variant based on conditions + if (!filteredVariantIds.contains(largerVariant.getID())) { + filteredVariantIds.add(smallerVariant.getID()); + } + } + + private boolean overlaps(final VariantContext v1, final VariantContext v2) { + return v1.getContig().equals(v2.getContig()) + && v1.getStart() <= (v2.getStart() + v2.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)) + && v2.getStart() <= (v1.getStart() + v1.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)); + } + + private double getCoverage(final VariantContext larger, final VariantContext smaller) { + final int largerStart = larger.getStart(); + final int smallerStart = smaller.getStart(); + final int largerStop = larger.getEnd(); + final int smallerStop = smaller.getEnd(); + + if (largerStart <= smallerStop && smallerStart <= largerStop) { + final int intersectionSize = Math.min(smallerStop, largerStop) - Math.max(smallerStart, largerStart) + 1; + return (double) intersectionSize / (smallerStop - smallerStart + 1); + } + return 0.0; + } +} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseOverlappingCnvCns.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseOverlappingCnvCns.java new file mode 100644 
index 00000000000..f7e607a17a5 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseOverlappingCnvCns.java @@ -0,0 +1,340 @@ +package org.broadinstitute.hellbender.tools.walkers.sv; + +import htsjdk.variant.variantcontext.Genotype; +import htsjdk.variant.variantcontext.GenotypeBuilder; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.variantcontext.VariantContextBuilder; +import htsjdk.variant.variantcontext.writer.VariantContextWriter; + +import org.broadinstitute.barclay.argparser.Argument; +import org.broadinstitute.barclay.argparser.BetaFeature; +import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; +import org.broadinstitute.barclay.help.DocumentedFeature; +import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; +import org.broadinstitute.hellbender.cmdline.programgroups.StructuralVariantDiscoveryProgramGroup; +import org.broadinstitute.hellbender.engine.*; +import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants; + +import java.util.Arrays; +import java.util.List; +import java.util.ArrayList; +import java.util.Set; +import java.util.Map; +import java.util.HashSet; +import java.util.HashMap; +import java.util.Collections; + +/** + * Completes a series of cleaning steps for a VCF produced by the GATK-SV pipeline. + * + *

Inputs

+ *
    + *
  • + * TODO + *
  • + *
+ * + *

Output

+ *
    + *
  • + * TODO + *
  • + *
+ * + *

Usage Example

+ *
+ *     TODO
+ * 
+ * + *

Processing Steps

+ *
    + *
  1. + * TODO + *
  2. + *
+ */ +@CommandLineProgramProperties( + summary = "Clean and format SV VCF", + oneLineSummary = "Clean and format SV VCF", + programGroup = StructuralVariantDiscoveryProgramGroup.class +) +@BetaFeature +@DocumentedFeature +public class SVReviseOverlappingCnvCns extends MultiplePassVariantWalker { + @Argument( + fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, + shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, + doc = "Output VCF name" + ) + private GATKPath outputVcf; + + private VariantContextWriter vcfWriter; + + // Data structures to hold accumulated data across variants + private static final List overlappingVariantsBuffer = new ArrayList<>(); + + private static final Map> abnormalRdCn = new HashMap<>(); + private static final Map> revisedCopyNumbers = new HashMap<>(); + private static final Set revisedComplete = new HashSet<>(); + + private static final int MIN_VARIANT_SIZE = 5000; + + @Override + public void onTraversalStart() { + vcfWriter = createVCFWriter(outputVcf); + vcfWriter.writeHeader(getHeaderForVariants()); + } + + @Override + public void closeTool() { + if (vcfWriter != null) { + vcfWriter.close(); + } + } + + @Override + protected int numberOfPasses() { return 2; } + + @Override + protected void afterNthPass(int n) {} + + @Override + protected void nthPassApply(final VariantContext variant, final ReadsContext readsContext, + final ReferenceContext referenceContext, final FeatureContext featureContext, int n) { + switch (n) { + case 0: + firstPassApply(variant); + break; + case 1: + secondPassApply(variant); + break; + } + } + + public void firstPassApply(final VariantContext variant) { + // Skip if not expected SVTYPE or below SVLEN threshold + if (!isDelDup(variant) || !isLarge(variant, MIN_VARIANT_SIZE)) { + return; + } + + // Flag sample as having an abnormal copy number if it passes certain conditions + for (final String sample : variant.getSampleNames()) { + final Genotype genotype = variant.getGenotype(sample); + if 
(!genotype.hasExtendedAttribute(GATKSVVCFConstants.RD_CN)) continue; + + final int rdCn = Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN).toString()); + final String svType = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); + if ((svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL) && rdCn < 2) || (svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DUP) && rdCn > 2)) { + abnormalRdCn.computeIfAbsent(variant.getID(), k -> new HashSet<>()).add(sample); + } + } + + // Process overlaps with variants in the buffer + overlappingVariantsBuffer.removeIf(vc -> !vc.getContig().equals(variant.getContig()) + || (vc.getStart() + vc.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)) < variant.getStart()); + for (final VariantContext bufferedVariant : overlappingVariantsBuffer) { + if (overlaps(variant, bufferedVariant)) { + adjustCopyNumber(variant, bufferedVariant); + } + } + overlappingVariantsBuffer.add(variant); + } + + public void secondPassApply(final VariantContext variant) { + VariantContextBuilder builder = new VariantContextBuilder(variant); + if (revisedCopyNumbers.containsKey(variant.getID())) { + processRevisedCn(builder, variant); + } + vcfWriter.add(builder.make()); + } + + private void adjustCopyNumber(final VariantContext v1, final VariantContext v2) { + // Determine larger variant + final int length1 = v1.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0); + final int length2 = v2.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0); + VariantContext largerVariant = v1; + VariantContext smallerVariant = v2; + int largerLength = length1; + int smallerLength = length2; + + // Swap variants if necessary + if (length2 > length1) { + largerVariant = v2; + smallerVariant = v1; + largerLength = length2; + smallerLength = length1; + } + + // Get variant attributes + final String variantId1 = largerVariant.getID(); + final String variantId2 = smallerVariant.getID(); + final Map variantRdCn1 = getRdCnForVariant(largerVariant); + final Map 
variantRdCn2 = getRdCnForVariant(smallerVariant); + final Map> variantSupport1 = getSupportForVariant(largerVariant); + final Map> variantSupport2 = getSupportForVariant(smallerVariant); + final String svType1 = largerVariant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); + final String svType2 = smallerVariant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); + + // Calculate overlap + final int minEnd = Math.min( + largerVariant.getStart() + largerVariant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0), + smallerVariant.getStart() + smallerVariant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0) + ); + final int maxStart = Math.max(largerVariant.getStart(), smallerVariant.getStart()); + final int lengthOverlap = minEnd - maxStart + 1; + final double overlap1 = (double) lengthOverlap / (double) largerLength; + final double overlap2 = (double) lengthOverlap / (double) smallerLength; + + // Get samples with abnormal CN across both variants + final Set samples = new HashSet<>(abnormalRdCn.getOrDefault(variantId1, Collections.emptySet())); + samples.retainAll(abnormalRdCn.getOrDefault(variantId2, Collections.emptySet())); + + // Iterate through samples to test against conditions + for (String sample : samples) { + final String id1 = variantId1 + "@" + sample; + final String id2 = variantId2 + "@" + sample; + if (revisedComplete.contains(id1)) { + continue; + } + + // Initialize variables for evaluation + final int rdCn1 = revisedCopyNumbers.getOrDefault(variantId1, Collections.emptyMap()).getOrDefault(sample, variantRdCn1.get(sample)); + final int rdCn2 = revisedCopyNumbers.getOrDefault(variantId2, Collections.emptyMap()).getOrDefault(sample, variantRdCn2.get(sample)); + final Set support1 = variantSupport1.get(sample); + final Set support2 = variantSupport2.get(sample); + final Genotype genotype2 = smallerVariant.getGenotype(sample); + + // Condition 1: Smaller depth call is being driven by larger call + if 
(support1.contains(GATKSVVCFConstants.EV_VALUES.get(1)) && support1.size() > 1 + && support2.equals(Collections.singleton(GATKSVVCFConstants.EV_VALUES.get(1))) + && overlap2 > 0.5 && !largerVariant.hasAttribute(GATKSVVCFConstants.MULTI_CNV)) { + if (rdCn1 == 0) { + makeRevision(id2, rdCn2 + 2); + } else if (rdCn1 == 1) { + makeRevision(id2, rdCn2 + rdCn1); + } else if (rdCn1 > 1) { + int newCN = rdCn2 - rdCn1 + 2; + newCN = Math.max(newCN, 0); + makeRevision(id2, newCN); + } + } + + // Condition 2: Smaller CNV is driven by larger CNV genotype + else if (support1.equals(Collections.singleton(GATKSVVCFConstants.EV_VALUES.get(1))) + && support2.contains(GATKSVVCFConstants.EV_VALUES.get(1)) && support2.size() > 1 + && overlap1 > 0.5 && overlap2 > 0.5 && !smallerVariant.hasAttribute(GATKSVVCFConstants.MULTI_CNV) + && !genotype2.isHomRef()) { + if (rdCn2 == 0) { + makeRevision(id1, rdCn1 + 2); + } else if (rdCn2 == 1) { + makeRevision(id1, rdCn1 + rdCn2); + } else if (rdCn2 > 1) { + int newCN = rdCn1 - rdCn2 + 2; + newCN = Math.max(newCN, 0); + makeRevision(id1, newCN); + } + } + + // Condition 3: Depth-only calls where smaller call is driven by larger call + else if (support1.equals(Collections.singleton(GATKSVVCFConstants.EV_VALUES.get(1))) + && support2.equals(Collections.singleton(GATKSVVCFConstants.EV_VALUES.get(1))) + && overlap2 > 0.5 && !largerVariant.hasAttribute(GATKSVVCFConstants.MULTI_CNV) && svType1.equals(svType2)) { + if (rdCn1 == 0 && rdCn1 != rdCn2) { + makeRevision(id2, rdCn2 + 2); + } else if (rdCn1 == 1 && rdCn1 > rdCn2) { + makeRevision(id2, 1); + } else if (rdCn1 > 1 && rdCn1 < rdCn2) { + makeRevision(id2, Math.max(rdCn2 - rdCn1 + 2, 0)); + } else { + makeRevision(id2, 2); + } + } + + // Condition 4: Any other time a larger call drives a smaller call + else if (support1.contains(GATKSVVCFConstants.EV_VALUES.get(1)) + && overlap2 > 0.5 && !largerVariant.hasAttribute(GATKSVVCFConstants.MULTI_CNV) && largerLength > MIN_VARIANT_SIZE) { + if (rdCn1 == 0) 
{ + makeRevision(id2, rdCn2 + 2); + } else if (rdCn1 == 1) { + makeRevision(id2, rdCn2 + rdCn1); + } else if (rdCn1 > 1) { + int newCN = rdCn2 - rdCn1 + 2; + newCN = Math.max(newCN, 0); + makeRevision(id2, newCN); + } + } + } + } + + private void makeRevision(final String id, final int val) { + final String[] tokens = id.split("@"); + final String variantId = tokens[0]; + final String sample = tokens[1]; + revisedCopyNumbers.computeIfAbsent(variantId, k -> new HashMap<>()).put(sample, val); + if (val == 2) { + revisedComplete.add(id); + } + } + + private void processRevisedCn(final VariantContextBuilder builder, final VariantContext variant) { + // Initialize data structures + final String variantID = variant.getID(); + final List genotypes = builder.getGenotypes(); + final List updatedGenotypes = new ArrayList<>(genotypes.size()); + + // Replace revised alleles and copy numbers + for (final Genotype genotype : genotypes) { + final String sampleName = genotype.getSampleName(); + if (revisedCopyNumbers.get(variantID).containsKey(sampleName)) { + final GenotypeBuilder gb = new GenotypeBuilder(genotype); + gb.alleles(Arrays.asList(variant.getReference(), variant.getAlternateAllele(0))); + gb.attribute(GATKSVVCFConstants.RD_CN, revisedCopyNumbers.get(variantID).get(sampleName)); + updatedGenotypes.add(gb.make()); + } else { + updatedGenotypes.add(genotype); + } + } + builder.genotypes(updatedGenotypes); + } + + private Map> getSupportForVariant(final VariantContext variant) { + Map> supportMap = new HashMap<>(); + for (String sample : variant.getSampleNames()) { + final Genotype genotype = variant.getGenotype(sample); + final String supportStr = genotype.hasExtendedAttribute(GATKSVVCFConstants.EV) ? 
genotype.getExtendedAttribute(GATKSVVCFConstants.EV).toString() : ""; + final Set supportSet = new HashSet<>(); + if (!supportStr.isEmpty()) { + supportSet.addAll(Arrays.asList(supportStr.split(","))); + } + supportMap.put(sample, supportSet); + } + return supportMap; + } + + private Map getRdCnForVariant(final VariantContext variant) { + final Map rdCnMap = new HashMap<>(); + for (String sample : variant.getSampleNames()) { + final Genotype genotype = variant.getGenotype(sample); + if (genotype.hasExtendedAttribute(GATKSVVCFConstants.RD_CN)) { + rdCnMap.put(sample, Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN).toString())); + } + } + return rdCnMap; + } + + private boolean isDelDup(final VariantContext variant) { + final String svType = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); + return svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL) || svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DUP); + } + + private boolean isLarge(final VariantContext variant, final int minSize) { + final int variantLength = Math.abs(variant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)); + return variantLength >= minSize; + } + + private boolean overlaps(final VariantContext v1, final VariantContext v2) { + return v1.getContig().equals(v2.getContig()) + && v1.getStart() <= (v2.getStart() + v2.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)) + && v2.getStart() <= (v1.getStart() + v1.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)); + } +} \ No newline at end of file diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseOverlappingCnvGts.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseOverlappingCnvGts.java new file mode 100644 index 00000000000..6f8f4eb7dbf --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseOverlappingCnvGts.java @@ -0,0 +1,330 @@ +package org.broadinstitute.hellbender.tools.walkers.sv; + +import 
htsjdk.variant.variantcontext.Genotype; +import htsjdk.variant.variantcontext.GenotypeBuilder; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.variantcontext.VariantContextBuilder; +import htsjdk.variant.variantcontext.writer.VariantContextWriter; +import htsjdk.variant.vcf.VCFHeader; +import htsjdk.variant.vcf.VCFHeaderLineType; +import htsjdk.variant.vcf.VCFInfoHeaderLine; + +import org.apache.commons.lang3.tuple.ImmutablePair; +import org.broadinstitute.barclay.argparser.Argument; +import org.broadinstitute.barclay.argparser.BetaFeature; +import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; +import org.broadinstitute.barclay.help.DocumentedFeature; +import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; +import org.broadinstitute.hellbender.cmdline.programgroups.StructuralVariantDiscoveryProgramGroup; +import org.broadinstitute.hellbender.engine.*; +import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants; + +import java.util.Arrays; +import java.util.List; +import java.util.ArrayList; +import java.util.Set; +import java.util.Map; +import java.util.HashSet; +import java.util.HashMap; +import org.apache.commons.lang3.tuple.Pair; + +/** + * Completes a series of cleaning steps for a VCF produced by the GATK-SV pipeline. + * + *

+ * <h3>Inputs</h3>
+ * <ul>
+ *     <li>
+ *         TODO
+ *     </li>
+ * </ul>
+ *
+ * <h3>Output</h3>
+ * <ul>
+ *     <li>
+ *         TODO
+ *     </li>
+ * </ul>
+ *
+ * <h3>Usage Example</h3>
+ * <pre>
+ *     TODO
+ * </pre>
+ *
+ * <h3>Processing Steps</h3>
+ * <ol>
+ *     <li>
+ *         TODO
+ *     </li>
+ * </ol>
+ */ +@CommandLineProgramProperties( + summary = "Clean and format SV VCF", + oneLineSummary = "Clean and format SV VCF", + programGroup = StructuralVariantDiscoveryProgramGroup.class +) +@BetaFeature +@DocumentedFeature +public class SVReviseOverlappingCnvGts extends MultiplePassVariantWalker { + @Argument( + fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, + shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, + doc = "Output VCF name" + ) + private GATKPath outputVcf; + + private VariantContextWriter vcfWriter; + + // Data structures to hold accumulated data across variants + private static final List overlappingVariantsBuffer = new ArrayList<>(); + + private static final Map>> revisedEventsAll = new HashMap<>(); + private static final Map> revisedEventsFiltered = new HashMap<>(); + private static final Map> currentCopyNumbers = new HashMap<>(); + + private static final int MIN_VARIANT_SIZE = 5000; + + @Override + protected int numberOfPasses() { return 3; } + + @Override + protected void afterNthPass(final int n) { + if (n == 0) { + processCollectedVariants(); + } + } + + @Override + public void onTraversalStart() { + vcfWriter = createVCFWriter(outputVcf); + final VCFHeader header = getHeaderForVariants(); + header.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.MULTI_CNV, 0, VCFHeaderLineType.Flag, "Variant is a multiallelic CNV")); + vcfWriter.writeHeader(header); + } + + @Override + public void closeTool() { + if (vcfWriter != null) { + vcfWriter.close(); + } + } + + @Override + protected void nthPassApply(final VariantContext variant, final ReadsContext readsContext, + final ReferenceContext referenceContext, final FeatureContext featureContext, final int n) { + switch (n) { + case 0: + firstPassApply(variant); + break; + case 1: + secondPassApply(variant); + break; + case 2: + thirdPassApply(variant); + break; + } + } + + public void firstPassApply(final VariantContext variant) { + if (!isDelDup(variant) || !isLarge(variant, 
MIN_VARIANT_SIZE)) { + return; + } + + // Process overlaps with variants in the buffer + overlappingVariantsBuffer.removeIf(vc -> !vc.getContig().equals(variant.getContig()) + || (vc.getStart() + vc.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)) < variant.getStart()); + for (final VariantContext bufferedVariant : overlappingVariantsBuffer) { + if (overlaps(bufferedVariant, variant)) { + processOverlap(bufferedVariant, variant); + } + } + overlappingVariantsBuffer.add(variant); + } + + public void secondPassApply(final VariantContext variant) { + if (!revisedEventsFiltered.containsKey(variant.getID())) { + return; + } + + // Initialize data structures + final String variantId = variant.getID(); + final Set samples = revisedEventsFiltered.get(variantId); + final Map variantRdCn = new HashMap<>(); + + // Initialize revisedRdCn value for each variant + for (final String sampleName : samples) { + final Genotype genotype = variant.getGenotype(sampleName); + if (!genotype.hasExtendedAttribute(GATKSVVCFConstants.RD_CN)) continue; + + final int rdCn = Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN).toString()); + variantRdCn.put(sampleName, rdCn); + } + currentCopyNumbers.put(variantId, variantRdCn); + } + + public void thirdPassApply(final VariantContext variant) { + final VariantContextBuilder builder = new VariantContextBuilder(variant); + if (revisedEventsAll.containsKey(variant.getID())) { + processRevisedEvent(builder, variant); + } + if (isDelDup(variant)) { + processCnvs(builder, variant); + } + vcfWriter.add(builder.make()); + } + + private void processCollectedVariants() { + // Prune variant-sample pairs we need RD_CN values for + for (final Map.Entry>> entry : revisedEventsAll.entrySet()) { + for (final Map.Entry> innerEntry : entry.getValue().entrySet()) { + final String sampleName = innerEntry.getKey(); + final String variantId = entry.getKey(); + final String widerVariantId = innerEntry.getValue().getLeft(); + final String svType = 
innerEntry.getValue().getRight(); + if (svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DUP) || svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL)) { + revisedEventsFiltered.computeIfAbsent(variantId, k -> new HashSet<>()).add(sampleName); + revisedEventsFiltered.computeIfAbsent(widerVariantId, k -> new HashSet<>()).add(sampleName); + } + } + } + } + + private void processOverlap(final VariantContext v1, final VariantContext v2) { + // Get overlap data + VariantContext wider; + VariantContext narrower; + final int length1 = v1.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0); + final int length2 = v2.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0); + if (length1 > length2) { + wider = v1; + narrower = v2; + } else if (length2 > length1) { + wider = v2; + narrower = v1; + } else { + return; + } + final String widerID = wider.getID(); + final String narrowerID = narrower.getID(); + + // Skip processing if same variant ID, SV type or samples + final String widerSvType = wider.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); + final String narrowerSvType = narrower.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); + final Set widerSamples = getNonReferenceSamples(wider); + final Set narrowerSamples = getNonReferenceSamples(narrower); + if (widerID.equals(narrowerID) || widerSvType.equals(narrowerSvType) || widerSamples.equals(narrowerSamples)) { + return; + } + + // Get samples present in wider but not in narrower + final Set nonCommonSamples = new HashSet<>(widerSamples); + nonCommonSamples.removeAll(narrowerSamples); + if (nonCommonSamples.isEmpty()) { + return; + } + + // Revise variant if coverage exceeds threshold + final double coverage = getCoverage(wider, narrower); + if (coverage >= 0.5) { + for (final String sample : nonCommonSamples) { + revisedEventsAll.computeIfAbsent(narrowerID, k -> new HashMap<>()) + .put(sample, new ImmutablePair<>(widerID, widerSvType)); + } + } + } + + private void processRevisedEvent(final VariantContextBuilder builder, 
final VariantContext variant) { + // Initialize data structures + final String variantId = variant.getID(); + final Map> variantEvents = revisedEventsAll.get(variantId); + final List newGenotypes = new ArrayList<>(); + + // Create updated genotypes + for (String sample : variant.getSampleNamesOrderedByName()) { + final Genotype oldGenotype = variant.getGenotype(sample); + final Pair event = variantEvents.get(sample); + + if (event != null) { + final String widerVariantId = event.getLeft(); + final String widerSvType = event.getRight(); + final int currentRdCn = currentCopyNumbers.get(variantId).getOrDefault(sample, 0); + final int widerRdCn = currentCopyNumbers.getOrDefault(widerVariantId, new HashMap<>()).getOrDefault(sample, 0); + + int newVal = -1; + if (widerSvType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DUP) && currentRdCn == 2 && widerRdCn == 3) { + newVal = 1; + } else if (widerSvType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL) && currentRdCn == 2 && widerRdCn == 1) { + newVal = 3; + } + + if (newVal != -1) { + final GenotypeBuilder gb = new GenotypeBuilder(oldGenotype); + gb.alleles(Arrays.asList(variant.getReference(), variant.getAlternateAllele(0))); + if (!oldGenotype.hasExtendedAttribute(GATKSVVCFConstants.RD_CN)) continue; + + final int rdCn = Integer.parseInt(oldGenotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN).toString()); + gb.GQ(rdCn); + newGenotypes.add(gb.make()); + } else { + newGenotypes.add(oldGenotype); + } + } else { + newGenotypes.add(oldGenotype); + } + } + builder.genotypes(newGenotypes); + } + + private void processCnvs(final VariantContextBuilder builder, final VariantContext variant) { + final boolean isDel = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, "").equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL); + for (String sample : variant.getSampleNamesOrderedByName()) { + final Genotype genotype = variant.getGenotype(sample); + if (!genotype.hasExtendedAttribute(GATKSVVCFConstants.RD_CN)) continue; + + final int rdCn 
= Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN).toString()); + if ((isDel && rdCn > 3) || (!isDel && (rdCn < 1 || rdCn > 4))) { + builder.attribute(GATKSVVCFConstants.MULTI_CNV, true); + break; + } + } + } + + private boolean isDelDup(final VariantContext variant) { + final String svType = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); + return svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL) || svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DUP); + } + + private boolean isLarge(final VariantContext variant, final int minSize) { + final int variantLength = Math.abs(variant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)); + return variantLength >= minSize; + } + + private boolean overlaps(final VariantContext v1, final VariantContext v2) { + return v1.getContig().equals(v2.getContig()) + && v1.getStart() <= (v2.getStart() + v2.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)) + && v2.getStart() <= (v1.getStart() + v1.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)); + } + + private Set getNonReferenceSamples(final VariantContext variant) { + final Set samples = new HashSet<>(); + for (final String sampleName : variant.getSampleNames()) { + final Genotype genotype = variant.getGenotype(sampleName); + if (genotype.isCalled() && !genotype.isHomRef()) { + samples.add(sampleName); + } + } + return samples; + } + + private double getCoverage(final VariantContext wider, final VariantContext narrower) { + final int nStart = narrower.getStart(); + final int nStop = narrower.getEnd(); + final int wStart = wider.getStart(); + final int wStop = wider.getEnd(); + + if (wStart <= nStop && nStart <= wStop) { + final int intersectionSize = Math.min(nStop, wStop) - Math.max(nStart, wStart) + 1; + return (double) intersectionSize / (nStop - nStart + 1); + } + return 0.0; + } +} \ No newline at end of file diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseOverlappingCnvs.java 
b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseOverlappingCnvs.java new file mode 100644 index 00000000000..39fa7ae19aa --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseOverlappingCnvs.java @@ -0,0 +1,589 @@ +package org.broadinstitute.hellbender.tools.walkers.sv; + +import htsjdk.variant.variantcontext.Genotype; +import htsjdk.variant.variantcontext.GenotypeBuilder; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.variantcontext.VariantContextBuilder; +import htsjdk.variant.variantcontext.writer.VariantContextWriter; +import htsjdk.variant.vcf.VCFHeader; +import htsjdk.variant.vcf.VCFHeaderLineType; +import htsjdk.variant.vcf.VCFInfoHeaderLine; +import org.apache.commons.lang3.tuple.ImmutablePair; +import org.apache.commons.lang3.tuple.Pair; +import org.broadinstitute.barclay.argparser.Argument; +import org.broadinstitute.barclay.argparser.BetaFeature; +import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; +import org.broadinstitute.barclay.help.DocumentedFeature; +import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; +import org.broadinstitute.hellbender.cmdline.programgroups.StructuralVariantDiscoveryProgramGroup; +import org.broadinstitute.hellbender.engine.*; +import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants; + +import java.util.*; + +/** + * Completes a series of cleaning steps for a VCF produced by the GATK-SV pipeline. + * + *

+ * <h3>Inputs</h3>
+ * <ul>
+ *     <li>
+ *         TODO
+ *     </li>
+ * </ul>
+ *
+ * <h3>Output</h3>
+ * <ul>
+ *     <li>
+ *         TODO
+ *     </li>
+ * </ul>
+ *
+ * <h3>Usage Example</h3>
+ * <pre>
+ *     TODO
+ * </pre>
+ *
+ * <h3>Processing Steps</h3>
+ * <ol>
+ *     <li>
+ *         TODO
+ *     </li>
+ * </ol>
+ */ +@CommandLineProgramProperties( + summary = "Clean and format SV VCF", + oneLineSummary = "Clean and format SV VCF", + programGroup = StructuralVariantDiscoveryProgramGroup.class +) +@BetaFeature +@DocumentedFeature +public class SVReviseOverlappingCnvs extends MultiplePassVariantWalker { + @Argument( + fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, + shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, + doc = "Output VCF name" + ) + private GATKPath outputVcf; + + private VariantContextWriter vcfWriter; + + // Data structure for overlap detection + private static final List overlappingVariantsBuffer = new ArrayList<>(); + + // Data structure for mutliallelic CNV detection + private static final Set multiCnvs = new HashSet<>(); + + // Data structures for revising genotypes + private static final Map>> revisedEventsAll = new HashMap<>(); + private static final Map> revisedEventsFiltered = new HashMap<>(); + private static final Map> currentCopyNumbers = new HashMap<>(); + + // Data structures for revising copy numbers + private static final Map> abnormalRdCn = new HashMap<>(); + private static final Map> revisedCopyNumbers = new HashMap<>(); + private static final Set revisedComplete = new HashSet<>(); + + // Data structures for cached data + private final Map> nonRefSamplesCache = new HashMap<>(); + private final Map>> supportCache = new HashMap<>(); + private final Map> rdCnCache = new HashMap<>(); + + private static final int MIN_VARIANT_SIZE = 5000; + + @Override + protected int numberOfPasses() { return 3; } + + @Override + protected void afterNthPass(final int n) { + if (n == 0) { + processCollectedVariants(); + clearAllCaches(); + } + } + + @Override + public void onTraversalStart() { + vcfWriter = createVCFWriter(outputVcf); + final VCFHeader header = getHeaderForVariants(); + header.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.MULTI_CNV, 0, VCFHeaderLineType.Flag, "Variant is a multiallelic CNV")); + 
vcfWriter.writeHeader(header); + } + + @Override + public void closeTool() { + if (vcfWriter != null) { + vcfWriter.close(); + } + } + + @Override + protected void nthPassApply(final VariantContext variant, final ReadsContext readsContext, + final ReferenceContext referenceContext, final FeatureContext featureContext, final int n) { + switch (n) { + case 0: + firstPassApply(variant); + break; + case 1: + secondPassApply(variant); + break; + case 2: + thirdPassApply(variant); + break; + } + } + + public void firstPassApply(final VariantContext variant) { + // Skip processing if not CNV + if (!isDelDup(variant)) { + return; + } + + // Flag variant as being a multiallelic CNV + final boolean isDel = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, "").equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL); + for (final Genotype genotype : variant.getGenotypes()) { + if (!genotype.hasExtendedAttribute(GATKSVVCFConstants.RD_CN)) continue; + + final int rdCn = Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN).toString()); + if ((isDel && rdCn > 3) || (!isDel && (rdCn < 1 || rdCn > 4))) { + multiCnvs.add(variant.getID()); + break; + } + } + + // Skip processing if below size threshold + if (!isLarge(variant, MIN_VARIANT_SIZE)) { + return; + } + + // Flag sample as having an abnormal copy number + for (final Genotype genotype : variant.getGenotypes()) { + if (!genotype.hasExtendedAttribute(GATKSVVCFConstants.RD_CN)) continue; + + final int rdCn = Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN).toString()); + final String svType = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); + if ((svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL) && rdCn < 2) || (svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DUP) && rdCn > 2)) { + abnormalRdCn.computeIfAbsent(variant.getID(), k -> new HashSet<>()).add(genotype.getSampleName()); + } + } + + // Remove variants not in current context window + 
overlappingVariantsBuffer.removeIf(overlappingVariant -> { + boolean shouldRemove = !overlappingVariant.getContig().equals(variant.getContig()) + || (overlappingVariant.getStart() + overlappingVariant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)) < variant.getStart(); + if (shouldRemove) removeVariantFromCaches(overlappingVariant.getID()); + return shouldRemove; + }); + + // Process overlaps with variants in the buffer + for (final VariantContext bufferedVariant : overlappingVariantsBuffer) { + if (overlaps(bufferedVariant, variant)) { + processGt(bufferedVariant, variant); + processCn(bufferedVariant, variant); + } + } + overlappingVariantsBuffer.add(variant); + } + + public void secondPassApply(final VariantContext variant) { + // Skip processing if not in revised events map + if (!revisedEventsFiltered.containsKey(variant.getID())) { + return; + } + + // Initialize data structures + final String variantId = variant.getID(); + final Set samples = revisedEventsFiltered.get(variantId); + final Map variantRdCn = new HashMap<>(); + + // Initialize revisedRdCn values for each variant + for (final String sampleName : samples) { + final Genotype genotype = variant.getGenotype(sampleName); + if (!genotype.hasExtendedAttribute(GATKSVVCFConstants.RD_CN)) continue; + + final int rdCn = Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN).toString()); + variantRdCn.put(sampleName, rdCn); + } + currentCopyNumbers.put(variantId, variantRdCn); + } + + public void thirdPassApply(final VariantContext variant) { + final VariantContextBuilder builder = new VariantContextBuilder(variant); + + // Revise genotypes + if (revisedEventsAll.containsKey(variant.getID())) { + processRevisedGt(builder, variant); + } + + // Revise copy numbers + if (revisedCopyNumbers.containsKey(variant.getID())) { + processRevisedCn(builder, variant); + } + + // Tag multiallelic CNVs + if (multiCnvs.contains((variant.getID()))) { + builder.attribute(GATKSVVCFConstants.MULTI_CNV, true); 
+ } + + vcfWriter.add(builder.make()); + } + + private void processGt(final VariantContext v1, final VariantContext v2) { + // Determine larger variant, swapping if necessary + VariantContext largerVariant = v1; + VariantContext smallerVariant = v2; + final int length1 = v1.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0); + final int length2 = v2.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0); + if (length2 > length1) { + largerVariant = v2; + smallerVariant = v1; + } + + // Skip if coverage below expected + final double coverage = getCoverage(largerVariant, smallerVariant); + if (coverage < 0.5) { + return; + } + + // Skip if same variant ID, SV type or sample sets + final String largerId = largerVariant.getID(); + final String smallerId = smallerVariant.getID(); + final String largerSvType = largerVariant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); + final String smallerSvType = smallerVariant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); + final Set largerSamples = getNonReferenceSamples(largerVariant); + final Set smallerSamples = getNonReferenceSamples(smallerVariant); + if (largerId.equals(smallerId) || largerSvType.equals(smallerSvType) || largerSamples.equals(smallerSamples)) { + return; + } + + // Skip if no non-overlapping samples + final Set nonCommonSamples = new HashSet<>(largerSamples); + nonCommonSamples.removeAll(smallerSamples); + if (nonCommonSamples.isEmpty()) { + return; + } + + // Add variant pair to data structure + for (final String sample : nonCommonSamples) { + revisedEventsAll.computeIfAbsent(smallerId, k -> new HashMap<>()) + .put(sample, new ImmutablePair<>(largerId, largerSvType)); + } + } + + private void processCn(final VariantContext v1, final VariantContext v2) { + // Determine larger variant, swapping if necessary + VariantContext largerVariant = v1; + VariantContext smallerVariant = v2; + int length1 = v1.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0); + int length2 = v2.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 
0); + if (length2 > length1) { + largerVariant = v2; + smallerVariant = v1; + length1 = length2; + } + + // Calculate overlap + final double coverage = getCoverage(largerVariant, smallerVariant); + if (coverage < 0.5) { + return; + } + + // Skip if no common abnormal samples + final Set samples = new HashSet<>(abnormalRdCn.getOrDefault(largerVariant.getID(), Collections.emptySet())); + samples.retainAll(abnormalRdCn.getOrDefault(smallerVariant.getID(), Collections.emptySet())); + if (samples.isEmpty()) { + return; + } + + // Cached non-boolean fields + final String largerId = largerVariant.getID(); + final String smallerId = smallerVariant.getID(); + final Map largerRdCn = getRdCn(largerVariant); + final Map smallerRdCn = getRdCn(smallerVariant); + final Map> largerSupport = getSupport(largerVariant); + final Map> smallerSupport = getSupport(smallerVariant); + final String largerSvType = largerVariant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); + final String smallerSvType = smallerVariant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); + + // Cached length fields + final int minEnd = Math.min( + largerVariant.getStart() + largerVariant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0), + smallerVariant.getStart() + smallerVariant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0) + ); + final int maxStart = Math.max(largerVariant.getStart(), smallerVariant.getStart()); + final int lengthOverlap = minEnd - maxStart + 1; + final double largerOverlap = (double) lengthOverlap / (double) length1; + + // Cached boolean fields + final boolean largerIsMultiCnv = multiCnvs.contains(largerId); + final boolean smallerIsMultiCnv = multiCnvs.contains(smallerId); + final boolean isMatchingSvType = largerSvType.equals(smallerSvType); + final boolean isOverlapping = (largerOverlap > 0.5); + final boolean isLargerThanMin = length1 > MIN_VARIANT_SIZE; + + // Iterate through samples to test against conditions + for (final String sample : samples) { + final String largerFullId 
= largerId + "@" + sample; + final String smallerFullId = smallerId + "@" + sample; + if (revisedComplete.contains(largerFullId)) { + continue; + } + + // Initialize variables for evaluation + final int largerSampleRdCn = revisedCopyNumbers.getOrDefault(largerId, Collections.emptyMap()).getOrDefault(sample, largerRdCn.get(sample)); + final int smallerSampleRdCn = revisedCopyNumbers.getOrDefault(smallerId, Collections.emptyMap()).getOrDefault(sample, smallerRdCn.get(sample)); + final Set largerSampleSupport = largerSupport.get(sample); + final Set smallerSampleSupport = smallerSupport.get(sample); + final Genotype genotype2 = smallerVariant.getGenotype(sample); + + // Condition 1: Smaller depth call is being driven by larger call + if (largerSampleSupport.contains(GATKSVVCFConstants.EV_VALUES.get(1)) && largerSampleSupport.size() > 1 + && smallerSampleSupport.equals(Collections.singleton(GATKSVVCFConstants.EV_VALUES.get(1))) && !largerIsMultiCnv) { + if (largerSampleRdCn == 0) { + makeRevision(smallerFullId, smallerSampleRdCn + 2); + } else if (largerSampleRdCn == 1) { + makeRevision(smallerFullId, smallerSampleRdCn + largerSampleRdCn); + } else if (largerSampleRdCn > 1) { + int newCN = smallerSampleRdCn - largerSampleRdCn + 2; + newCN = Math.max(newCN, 0); + makeRevision(smallerFullId, newCN); + } + } + + // Condition 2: Smaller CNV is driven by larger CNV genotype + else if (smallerSampleSupport.contains(GATKSVVCFConstants.EV_VALUES.get(1)) && smallerSampleSupport.size() > 1 + && largerSampleSupport.equals(Collections.singleton(GATKSVVCFConstants.EV_VALUES.get(1))) + && !genotype2.isHomRef() && !smallerIsMultiCnv && isOverlapping) { + if (smallerSampleRdCn == 0) { + makeRevision(largerFullId, largerSampleRdCn + 2); + } else if (smallerSampleRdCn == 1) { + makeRevision(largerFullId, largerSampleRdCn + smallerSampleRdCn); + } else if (smallerSampleRdCn > 1) { + int newCN = largerSampleRdCn - smallerSampleRdCn + 2; + newCN = Math.max(newCN, 0); + 
makeRevision(largerFullId, newCN); + } + } + + // Condition 3: Depth-only calls where smaller call is driven by larger call + else if (largerSampleSupport.equals(Collections.singleton(GATKSVVCFConstants.EV_VALUES.get(1))) + && smallerSampleSupport.equals(Collections.singleton(GATKSVVCFConstants.EV_VALUES.get(1))) + && !largerIsMultiCnv && isMatchingSvType) { + if (largerSampleRdCn == 0 && largerSampleRdCn != smallerSampleRdCn) { + makeRevision(smallerFullId, smallerSampleRdCn + 2); + } else if (largerSampleRdCn == 1 && largerSampleRdCn > smallerSampleRdCn) { + makeRevision(smallerFullId, 1); + } else if (largerSampleRdCn > 1 && largerSampleRdCn < smallerSampleRdCn) { + makeRevision(smallerFullId, Math.max(smallerSampleRdCn - largerSampleRdCn + 2, 0)); + } else { + makeRevision(smallerFullId, 2); + } + } + + // Condition 4: Any other time a larger call drives a smaller call + else if (largerSampleSupport.contains(GATKSVVCFConstants.EV_VALUES.get(1)) && !largerIsMultiCnv && isLargerThanMin) { + if (largerSampleRdCn == 0) { + makeRevision(smallerFullId, smallerSampleRdCn + 2); + } else if (largerSampleRdCn == 1) { + makeRevision(smallerFullId, smallerSampleRdCn + largerSampleRdCn); + } else if (largerSampleRdCn > 1) { + int newCN = smallerSampleRdCn - largerSampleRdCn + 2; + newCN = Math.max(newCN, 0); + makeRevision(smallerFullId, newCN); + } + } + } + } + + private void processRevisedGt(final VariantContextBuilder builder, final VariantContext variant) { + // Initialize data structures + final String variantId = variant.getID(); + final Map> variantEvents = revisedEventsAll.get(variantId); + final Map revisedGenotypes = new HashMap<>(); + final List oldGenotypes = variant.getGenotypes(); + final List newGenotypes = new ArrayList<>(oldGenotypes.size()); + + // Populate genotypes that need revising + for (final Map.Entry> entry : variantEvents.entrySet()) { + final Pair event = entry.getValue(); + if (event == null) { + continue; + } + + final String sampleName = 
entry.getKey(); + final Genotype genotype = variant.getGenotype(sampleName); + final String largerId = event.getLeft(); + final String largerSvType = event.getRight(); + final int currentRdCn = currentCopyNumbers.get(variantId).getOrDefault(sampleName, 0); + final int largerRdCn = currentCopyNumbers.getOrDefault(largerId, new HashMap<>()).getOrDefault(sampleName, 0); + + int newVal = -1; + if (largerSvType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DUP) && currentRdCn == 2 && largerRdCn == 3) { + newVal = 1; + } else if (largerSvType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL) && currentRdCn == 2 && largerRdCn == 1) { + newVal = 3; + } + + if (newVal != -1) { + final GenotypeBuilder gb = new GenotypeBuilder(genotype); + gb.alleles(Arrays.asList(variant.getReference(), variant.getAlternateAllele(0))); + if (!genotype.hasExtendedAttribute(GATKSVVCFConstants.RD_CN)) continue; + + final int rdCn = Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN).toString()); + gb.GQ(rdCn); + revisedGenotypes.put(sampleName, gb.make()); + } + } + + // Populate genotypes that don't need revising + for (final Genotype genotype : oldGenotypes) { + final String sampleName = genotype.getSampleName(); + newGenotypes.add(revisedGenotypes.getOrDefault(sampleName, genotype)); + } + + builder.genotypes(newGenotypes); + } + + private void processRevisedCn(final VariantContextBuilder builder, final VariantContext variant) { + // Initialize data structures + final String variantId = variant.getID(); + final List genotypes = builder.getGenotypes(); + final List updatedGenotypes = new ArrayList<>(genotypes.size()); + + // Replace revised alleles and copy numbers + for (final Genotype genotype : genotypes) { + final String sampleName = genotype.getSampleName(); + if (revisedCopyNumbers.get(variantId).containsKey(sampleName)) { + final GenotypeBuilder gb = new GenotypeBuilder(genotype); + gb.alleles(Arrays.asList(variant.getReference(), variant.getAlternateAllele(0))); + 
gb.attribute(GATKSVVCFConstants.RD_CN, revisedCopyNumbers.get(variantId).get(sampleName)); + updatedGenotypes.add(gb.make()); + } else { + updatedGenotypes.add(genotype); + } + } + builder.genotypes(updatedGenotypes); + } + + private void processCollectedVariants() { + // Prune variant-sample pairs we need RD_CN values for + for (final Map.Entry>> entry : revisedEventsAll.entrySet()) { + for (final Map.Entry> innerEntry : entry.getValue().entrySet()) { + final String sampleName = innerEntry.getKey(); + final String variantId = entry.getKey(); + final String largerVariantId = innerEntry.getValue().getLeft(); + final String largerSvType = innerEntry.getValue().getRight(); + if (largerSvType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DUP) || largerSvType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL)) { + revisedEventsFiltered.computeIfAbsent(variantId, k -> new HashSet<>()).add(sampleName); + revisedEventsFiltered.computeIfAbsent(largerVariantId, k -> new HashSet<>()).add(sampleName); + } + } + } + } + + private Set getNonReferenceSamples(final VariantContext variant) { + final String variantId = variant.getID(); + if (nonRefSamplesCache.containsKey(variantId)) { + return nonRefSamplesCache.get(variantId); + } + + final Set samples = new HashSet<>(); + for (final Genotype genotype : variant.getGenotypes()) { + if (genotype.isCalled() && !genotype.isHomRef()) { + samples.add(genotype.getSampleName()); + } + } + + nonRefSamplesCache.put(variantId, samples); + return samples; + } + + private Map> getSupport(final VariantContext variant) { + final String variantId = variant.getID(); + if (supportCache.containsKey(variantId)) { + return supportCache.get(variantId); + } + + Map> supportMap = new HashMap<>(); + for (final Genotype genotype : variant.getGenotypes()) { + final String supportStr = genotype.hasExtendedAttribute(GATKSVVCFConstants.EV) + ? 
genotype.getExtendedAttribute(GATKSVVCFConstants.EV).toString() + : ""; + final Set supportSet = new HashSet<>(); + if (!supportStr.isEmpty()) { + supportSet.addAll(Arrays.asList(supportStr.split(","))); + } + supportMap.put(genotype.getSampleName(), supportSet); + } + + supportCache.put(variantId, supportMap); + return supportMap; + } + + private Map getRdCn(final VariantContext variant) { + final String variantId = variant.getID(); + if (rdCnCache.containsKey(variantId)) { + return rdCnCache.get(variantId); + } + + final Map rdCnMap = new HashMap<>(); + for (final Genotype genotype : variant.getGenotypes()) { + if (genotype.hasExtendedAttribute(GATKSVVCFConstants.RD_CN)) { + rdCnMap.put(genotype.getSampleName(), Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN).toString())); + } + } + + rdCnCache.put(variantId, rdCnMap); + return rdCnMap; + } + + private void clearAllCaches() { + nonRefSamplesCache.clear(); + supportCache.clear(); + rdCnCache.clear(); + } + + private void removeVariantFromCaches(final String variantID) { + nonRefSamplesCache.remove(variantID); + supportCache.remove(variantID); + rdCnCache.remove(variantID); + } + + private void makeRevision(final String id, final int val) { + final String[] tokens = id.split("@"); + final String variantId = tokens[0]; + final String sample = tokens[1]; + revisedCopyNumbers.computeIfAbsent(variantId, k -> new HashMap<>()).put(sample, val); + if (val == 2) { + revisedComplete.add(id); + } + } + + private boolean isDelDup(final VariantContext variant) { + final String svType = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); + return svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL) || svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DUP); + } + + private boolean isLarge(final VariantContext variant, final int minSize) { + final int variantLength = Math.abs(variant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)); + return variantLength >= minSize; + } + + private boolean 
overlaps(final VariantContext v1, final VariantContext v2) { + return v1.getContig().equals(v2.getContig()) + && v1.getStart() <= (v2.getStart() + v2.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)) + && v2.getStart() <= (v1.getStart() + v1.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)); + } + + private double getCoverage(final VariantContext larger, final VariantContext smaller) { + final int largerStart = larger.getStart(); + final int smallerStart = smaller.getStart(); + final int largerStop = larger.getEnd(); + final int smallerStop = smaller.getEnd(); + + if (largerStart <= smallerStop && smallerStart <= largerStop) { + final int intersectionSize = Math.min(smallerStop, largerStop) - Math.max(smallerStart, largerStart) + 1; + return (double) intersectionSize / (smallerStop - smallerStart + 1); + } + return 0.0; + } +} \ No newline at end of file