diff --git a/CHANGELOG.md b/CHANGELOG.md index 4a85609e28..8bedf8e075 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,9 @@ * Making faMT annotation for refSeq optional * Update RefSeq parser that does not run into null-pointer exceptions on mouse and rat genome (e.g. when no exon defines the gene name) * Fixing issue with block substitutions (#475). +* Fixing RefSeq build to properly assign transcript model (use best match with alignment) in the case of duplicates. +* Fixing issue in projection in the case of leading gaps (has no effect on CDS position prediction). +* Adding `TranscriptModel.getTrimmedSequence()` that removes leading and trailing (unaligned in RefSeq) sequence. ### jannovar-htsjdk diff --git a/jannovar-core/src/main/java/de/charite/compbio/jannovar/annotation/builders/InsertionAnnotationBuilder.java b/jannovar-core/src/main/java/de/charite/compbio/jannovar/annotation/builders/InsertionAnnotationBuilder.java index 22635b7649..7723d64da3 100644 --- a/jannovar-core/src/main/java/de/charite/compbio/jannovar/annotation/builders/InsertionAnnotationBuilder.java +++ b/jannovar-core/src/main/java/de/charite/compbio/jannovar/annotation/builders/InsertionAnnotationBuilder.java @@ -86,7 +86,7 @@ protected NucleotideChange getCDSNTChange() { } catch (ProjectionException e) { throw new Error("Bug: at this point, the position must be a transcript position"); } - if (DuplicationChecker.isDuplication(transcript.getSequence(), change.getAlt(), txPos.getPos())) { + if (DuplicationChecker.isDuplication(transcript.getTrimmedSequence(), change.getAlt(), txPos.getPos())) { NucleotidePointLocationBuilder posBuilder = new NucleotidePointLocationBuilder(transcript); if (change.getAlt().length() == 1) { try { diff --git a/jannovar-core/src/main/java/de/charite/compbio/jannovar/annotation/builders/SNVAnnotationBuilder.java b/jannovar-core/src/main/java/de/charite/compbio/jannovar/annotation/builders/SNVAnnotationBuilder.java index a8288a9831..a36a3979a5 100644 --- a/jannovar-core/src/main/java/de/charite/compbio/jannovar/annotation/builders/SNVAnnotationBuilder.java +++ b/jannovar-core/src/main/java/de/charite/compbio/jannovar/annotation/builders/SNVAnnotationBuilder.java @@ -76,8 +76,8 @@ private Annotation buildCDSExonicAnnotation() { // Check that the WT nucleotide from the transcript is consistent with change.ref and generate a warning message // if this is not the case. - if (txPos.getPos() >= transcript.getSequence().length() - || !transcript.getSequence().substring(txPos.getPos(), txPos.getPos() + 1).equals(change.getRef())) + if (txPos.getPos() >= transcript.getTrimmedSequence().length() + || !transcript.getTrimmedSequence().substring(txPos.getPos(), txPos.getPos() + 1).equals(change.getRef())) messages.add(AnnotationMessage.WARNING_REF_DOES_NOT_MATCH_TRANSCRIPT); // Compute the frame shift and codon start position. diff --git a/jannovar-core/src/main/java/de/charite/compbio/jannovar/impl/parse/refseq/RefSeqParser.java b/jannovar-core/src/main/java/de/charite/compbio/jannovar/impl/parse/refseq/RefSeqParser.java index c21e5a60ed..3089b9ad18 100644 --- a/jannovar-core/src/main/java/de/charite/compbio/jannovar/impl/parse/refseq/RefSeqParser.java +++ b/jannovar-core/src/main/java/de/charite/compbio/jannovar/impl/parse/refseq/RefSeqParser.java @@ -119,7 +119,7 @@ public ImmutableList run() throws TranscriptParseException { // Load the FASTA file and assign to the builders. final String pathFASTA = PathUtil.join(basePath, getINIFileName("rna")); loadMitochondrialFASTA(builders); - + loadFASTA(builders, pathFASTA); // Create final list of TranscriptModels. @@ -158,13 +158,13 @@ private void loadMitochondrialFASTA(Map builders LOGGER.warn("Key for chrMT FASTA File does not exist, skipping."); return; } - + final String pathFasta = PathUtil.join(basePath, getINIFileName("faMT")); if (!new File(pathFasta).exists()) { LOGGER.warn("The chrMT FASTA File {} does not exist, skipping.", new Object[]{ pathFasta }); return; } - + final int idMT = refDict.getContigNameToID().get("chrMT"); @@ -303,7 +303,7 @@ private Map loadTranscriptModels(String pathGFF) Set wantedTypes = Sets.newHashSet("exon", "CDS", "stop_codon"); boolean onlyCurated = onlyCurated(); - + // Some exons do not have a gene name in mm9 and rat releases. // Therefore we select collect the name from the gene. Map entrezMap = new HashMap<>(); @@ -327,13 +327,13 @@ private Map loadTranscriptModels(String pathGFF) boolean isMT = contigDict.containsKey("chrMT") && contigDict.get("chrMT").equals(contigDict.get(record.getSeqID())); if ("gene".equals(record.getType())) { - + String entrezId = parseGeneID(record); String symbol = null; symbol = record.getAttributes().get("gene"); entrezMap.put(entrezId, symbol); - + // Register chrMT Entrez IDs to the "MT..." gene name. if (isMT) { if (record.getAttributes().get("gene_synonym") == null) { @@ -349,7 +349,7 @@ private Map loadTranscriptModels(String pathGFF) mtEntrezMap.put(entrezId, symbol); } } - + // Handle the records describing the transcript structure. // @@ -380,12 +380,12 @@ private Map loadTranscriptModels(String pathGFF) } String entrezId = parseGeneID(record); - + // // skip on chrMT some tRNA that do not have any Dbxref. This happens in mm9 and rat v6 // if (record.getAttributes().get("gbkey") == "tRNA" && entrezId == null) { // continue; // } - + transcriptId = mtEntrezMap.get(entrezId); builder.setAccession(transcriptId); } @@ -511,9 +511,9 @@ private TranscriptModelBuilder createNewTranscriptModelBuilder(FeatureRecord rec builder.setTxVersion(record.getAttributes().get("transcript_version")); String geneID = parseGeneID(record); builder.setGeneID(geneID); - + String gene = record.getAttributes().get("gene"); - + if (gene == null && geneID != null && notMtEntrezMap.containsKey(geneID)) { gene = notMtEntrezMap.get(geneID); } @@ -646,14 +646,15 @@ private Map mapNonRedundantTranscriptIdsToTransc } else { duplicatedTranscriptIdCount++; // This is a tricky call - there are about 30 transcripts with duplicated transcriptIds due to imperfect - // alignment to the genomic reference - LOGGER.warn("Transcript {} has {} possible transcript models - using the longest model", transcriptId, parentIds + // alignment to the genomic reference. We chose the one with the best match between alignment from cDNA_match + // and exons. + LOGGER.warn("Transcript {} has {} possible transcript models - picking best by cDNA_match", transcriptId, parentIds .size()); List possibleModels = parentIds.stream() .map(transcriptModelsWithTxRegion::get) .collect(toList()); - TranscriptModelBuilder longestCds = findLongestTranscriptRegion(possibleModels); - transcriptIdToTranscriptModelBuilders.put(transcriptId, longestCds); + TranscriptModelBuilder bestModel = findBestModelByCdnaMatch(possibleModels); + transcriptIdToTranscriptModelBuilders.put(transcriptId, bestModel); } } } @@ -663,13 +664,45 @@ private Map mapNonRedundantTranscriptIdsToTransc return transcriptIdToTranscriptModelBuilders; } - private TranscriptModelBuilder findLongestTranscriptRegion(List possibleModels) { - TranscriptModelBuilder longestCds = possibleModels.get(0); + private static int countCdnaMatches(TranscriptModelBuilder model) { + int result = 0; + int i = 0; + for (AlignmentPart part : model.getAlignmentParts()) { + if (i >= model.getExonRegions().size()) { + continue; + } + final GenomeInterval exon = model.getExonRegions().get(i).withStrand(Strand.FWD); + + final int partBegin; + final int partEnd; + + if (part.refBeginPos < part.refEndPos) { + partBegin = part.refBeginPos; + partEnd = part.refEndPos; + } else { + partEnd = part.refBeginPos; + partBegin = part.refEndPos; + } + + if (exon.getBeginPos() == partBegin && exon.getEndPos() == partEnd) { + result += 1; + } + + i += 1; + } + return result; + } + + private TranscriptModelBuilder findBestModelByCdnaMatch(List possibleModels) { + TranscriptModelBuilder bestMatch = possibleModels.get(0); + int bestMatchCount = countCdnaMatches(bestMatch); for (TranscriptModelBuilder current : possibleModels) { - if (current.getTXRegion().length() > longestCds.getTXRegion().length()) { - longestCds = current; + int currentMatchCount = countCdnaMatches(current); + if (currentMatchCount > bestMatchCount) { + bestMatch = current; + bestMatchCount = currentMatchCount; } } - return longestCds; + return bestMatch; } } diff --git a/jannovar-core/src/main/java/de/charite/compbio/jannovar/reference/GenomeVariantNormalizer.java b/jannovar-core/src/main/java/de/charite/compbio/jannovar/reference/GenomeVariantNormalizer.java index 6075c5f001..91125e6af6 100644 --- a/jannovar-core/src/main/java/de/charite/compbio/jannovar/reference/GenomeVariantNormalizer.java +++ b/jannovar-core/src/main/java/de/charite/compbio/jannovar/reference/GenomeVariantNormalizer.java @@ -53,7 +53,7 @@ public static GenomeVariant normalizeInsertion(TranscriptModel transcript, Genom // Insert the ALT bases at the position indicated by txPos. int pos = txPos.getPos(); - StringBuilder builder = new StringBuilder(transcript.getSequence()); + StringBuilder builder = new StringBuilder(transcript.getTrimmedSequence()); builder.insert(pos, change.getAlt()); // Execute algorithm and compute the shift. @@ -106,7 +106,7 @@ public static GenomeVariant normalizeDeletion(TranscriptModel transcript, Genome // Shift the deletion to the 3' (right) end of the transcript. int pos = txPos.getPos(); final int LEN = change.getRef().length(); // length of the deletion - final String seq = transcript.getSequence(); + final String seq = transcript.getTrimmedSequence(); int shift = 0; while ((pos + LEN < seq.length()) && (seq.charAt(pos) == seq.charAt(pos + LEN))) { diff --git a/jannovar-core/src/main/java/de/charite/compbio/jannovar/reference/TranscriptInterval.java b/jannovar-core/src/main/java/de/charite/compbio/jannovar/reference/TranscriptInterval.java index c1255662e6..ca99b3f4e5 100644 --- a/jannovar-core/src/main/java/de/charite/compbio/jannovar/reference/TranscriptInterval.java +++ b/jannovar-core/src/main/java/de/charite/compbio/jannovar/reference/TranscriptInterval.java @@ -91,7 +91,7 @@ public TranscriptPosition getTranscriptEndPos() { */ @Override public String toString() { - return StringUtil.concatenate(transcript.getAccession(), ":n.", beginPos + 1, "-", endPos); + return StringUtil.concatenate(transcript.getAccession(), ":n.", beginPos + 1, "_", endPos); } /* diff --git a/jannovar-core/src/main/java/de/charite/compbio/jannovar/reference/TranscriptModel.java b/jannovar-core/src/main/java/de/charite/compbio/jannovar/reference/TranscriptModel.java index a31e21d47f..1991bb06ee 100644 --- a/jannovar-core/src/main/java/de/charite/compbio/jannovar/reference/TranscriptModel.java +++ b/jannovar-core/src/main/java/de/charite/compbio/jannovar/reference/TranscriptModel.java @@ -167,6 +167,15 @@ public String getSequence() { return sequence; } + /** + * @return mDNA sequence of the spliced RNA of this known gene transcript with leading and + * trailing sequences removed. + */ + public String getTrimmedSequence() { + final Alignment ali = seqAlignment; + return sequence.substring(ali.refLeadingGapLength(), sequence.length() - ali.refTrailingGapLength()); + } + /** * @Return the sequence alignment to the exonic genome reference */ diff --git a/jannovar-core/src/main/java/de/charite/compbio/jannovar/reference/TranscriptProjectionDecorator.java b/jannovar-core/src/main/java/de/charite/compbio/jannovar/reference/TranscriptProjectionDecorator.java index 90190e5cd3..e45e58bf8e 100644 --- a/jannovar-core/src/main/java/de/charite/compbio/jannovar/reference/TranscriptProjectionDecorator.java +++ b/jannovar-core/src/main/java/de/charite/compbio/jannovar/reference/TranscriptProjectionDecorator.java @@ -47,7 +47,7 @@ public String getCDSTranscript() { try { TranscriptPosition tBeginPos = genomeToTranscriptPos(transcript.getCDSRegion().getGenomeBeginPos()); TranscriptPosition tEndPos = genomeToTranscriptPos(transcript.getCDSRegion().getGenomeEndPos()); - return transcript.getSequence().substring(tBeginPos.getPos(), tEndPos.getPos()); + return transcript.getTrimmedSequence().substring(tBeginPos.getPos(), tEndPos.getPos()); } catch (ProjectionException e) { throw new Error("Bug: CDS begin/end must be translatable into transcript positions"); } @@ -59,7 +59,7 @@ public String getCDSTranscript() { public String getTranscriptStartingAtCDS() { try { TranscriptPosition tBeginPos = genomeToTranscriptPos(transcript.getCDSRegion().getGenomeBeginPos()); - return transcript.getSequence().substring(tBeginPos.getPos(), transcript.getSequence().length()); + return transcript.getTrimmedSequence().substring(tBeginPos.getPos(), transcript.getTrimmedSequence().length()); } catch (ProjectionException e) { throw new Error("Bug: CDS begin must be translatable into transcript positions"); } @@ -88,7 +88,8 @@ public TranscriptPosition genomeToTranscriptPos(GenomePosition pos) throws Proje int posInExon = pos.differenceTo(region.getGenomeBeginPos()); int transcriptPos = tOffset + posInExon; // Project for position on genomic exons to position in sequence. - int projectedTranscriptPos = transcript.getSeqAlignment().projectRefToQry(transcriptPos); + int projectedTranscriptPos = transcript.getSeqAlignment().projectRefToQry(transcriptPos) - + transcript.getSeqAlignment().refLeadingGapLength(); return new TranscriptPosition(transcript, projectedTranscriptPos, PositionType.ZERO_BASED); } tOffset += region.length(); diff --git a/jannovar-core/src/main/java/de/charite/compbio/jannovar/reference/TranscriptSequenceChangeHelper.java b/jannovar-core/src/main/java/de/charite/compbio/jannovar/reference/TranscriptSequenceChangeHelper.java index c98fb6cfcd..bc33e7dbc5 100644 --- a/jannovar-core/src/main/java/de/charite/compbio/jannovar/reference/TranscriptSequenceChangeHelper.java +++ b/jannovar-core/src/main/java/de/charite/compbio/jannovar/reference/TranscriptSequenceChangeHelper.java @@ -50,7 +50,7 @@ private String getTranscriptWithPointInRefAffected(GenomeVariant change) { TranscriptSequenceOntologyDecorator soDecorator = new TranscriptSequenceOntologyDecorator(transcript); if (!transcript.getTXRegion().overlapsWith(change.getGenomeInterval()) || !soDecorator.overlapsWithExon(change.getGenomeInterval())) - return transcript.getSequence(); // non-coding change, does not affect transcript + return transcript.getTrimmedSequence(); // non-coding change, does not affect transcript // Get transcript position for the change position. TranscriptProjectionDecorator projector = new TranscriptProjectionDecorator(transcript); @@ -62,7 +62,7 @@ private String getTranscriptWithPointInRefAffected(GenomeVariant change) { } // Update base in string using StringBuilder. - StringBuilder builder = new StringBuilder(transcript.getSequence()); + StringBuilder builder = new StringBuilder(transcript.getTrimmedSequence()); if (change.getType() == GenomeVariantType.SNV) builder.setCharAt(tPos.getPos(), change.getAlt().charAt(0)); else @@ -73,7 +73,7 @@ private String getTranscriptWithPointInRefAffected(GenomeVariant change) { private String getTranscriptWithRangeInRefAffected(GenomeVariant change) { // Short-circuit in the case of change that does not affect the transcript. if (!transcript.getTXRegion().overlapsWith(change.getGenomeInterval())) - return transcript.getSequence(); + return transcript.getTrimmedSequence(); // Get transcript begin and end position. GenomePosition changeBeginPos = change.getGenomeInterval().getGenomeBeginPos(); @@ -94,7 +94,7 @@ private String getTranscriptWithRangeInRefAffected(GenomeVariant change) { } // Build resulting transcript string. - StringBuilder builder = new StringBuilder(transcript.getSequence()); + StringBuilder builder = new StringBuilder(transcript.getTrimmedSequence()); builder.delete(tBeginPos.getPos(), tEndPos.getPos()); builder.insert(tBeginPos.getPos(), change.getAlt()); return builder.toString(); diff --git a/jannovar-core/src/main/java/de/charite/compbio/jannovar/reference/TranscriptSequenceDecorator.java b/jannovar-core/src/main/java/de/charite/compbio/jannovar/reference/TranscriptSequenceDecorator.java index 0e50c1ecf5..7fc7430570 100644 --- a/jannovar-core/src/main/java/de/charite/compbio/jannovar/reference/TranscriptSequenceDecorator.java +++ b/jannovar-core/src/main/java/de/charite/compbio/jannovar/reference/TranscriptSequenceDecorator.java @@ -80,10 +80,10 @@ public String getCodonAt(TranscriptPosition txPos, CDSPosition cdsPos) throws In int frameShift = cdsPos.getPos() % 3; int codonStart = txPos.getPos() - frameShift; // codon start in transcript string int endPos = codonStart + 3; - if (transcript.getSequence().length() < endPos) + if (transcript.getTrimmedSequence().length() < endPos) throw new InvalidCodonException("Could not access codon " + codonStart + " - " + endPos - + ", transcript sequence length is " + transcript.getSequence().length()); - return transcript.getSequence().substring(codonStart, endPos); + + ", transcript sequence length is " + transcript.getTrimmedSequence().length()); + return transcript.getTrimmedSequence().substring(codonStart, endPos); } /** @@ -102,9 +102,9 @@ public String getCodonsStartingFrom(TranscriptPosition txPos, CDSPosition cdsPos int frameShift = cdsPos.getPos() % 3; int codonStart = txPos.getPos() - frameShift; // codon start in transcript string int endPos = codonStart + 3 * count; - if (endPos > transcript.getSequence().length()) - endPos = transcript.getSequence().length(); - return transcript.getSequence().substring(codonStart, endPos); + if (endPos > transcript.getTrimmedSequence().length()) + endPos = transcript.getTrimmedSequence().length(); + return transcript.getTrimmedSequence().substring(codonStart, endPos); } /** @@ -116,7 +116,7 @@ public String getCodonsStartingFrom(TranscriptPosition txPos, CDSPosition cdsPos * @return the codon affected by a change at the given position */ public String getCodonsStartingFrom(TranscriptPosition txPos, CDSPosition cdsPos) { - return getCodonsStartingFrom(txPos, cdsPos, transcript.getSequence().length()); + return getCodonsStartingFrom(txPos, cdsPos, transcript.getTrimmedSequence().length()); } } diff --git a/jannovar-core/src/test/java/de/charite/compbio/jannovar/impl/parse/refseq/RefSeqParserTest.java b/jannovar-core/src/test/java/de/charite/compbio/jannovar/impl/parse/refseq/RefSeqParserTest.java index 0b036738ac..19f0bd4e3b 100644 --- a/jannovar-core/src/test/java/de/charite/compbio/jannovar/impl/parse/refseq/RefSeqParserTest.java +++ b/jannovar-core/src/test/java/de/charite/compbio/jannovar/impl/parse/refseq/RefSeqParserTest.java @@ -414,16 +414,16 @@ private String printTranscriptModel(TranscriptModel transcriptModel) { TranscriptProjectionDecorator tx0Decorator = new TranscriptProjectionDecorator(tx0); TranscriptPosition tx0Pos0 = tx0Decorator .genomeToTranscriptPos(new GenomePosition(refDict, Strand.FWD, 1, 91870425)); - assertEquals(1, tx0Pos0.getPos()); + assertEquals(0, tx0Pos0.getPos()); TranscriptPosition tx0Pos1 = tx0Decorator .genomeToTranscriptPos(new GenomePosition(refDict, Strand.FWD, 1, 91870424)); - assertEquals(2, tx0Pos1.getPos()); + assertEquals(1, tx0Pos1.getPos()); TranscriptPosition tx0Pos2 = tx0Decorator .genomeToTranscriptPos(new GenomePosition(refDict, Strand.FWD, 1, 91870423)); - assertEquals(3, tx0Pos2.getPos()); + assertEquals(2, tx0Pos2.getPos()); TranscriptPosition tx0Pos3 = tx0Decorator .genomeToTranscriptPos(new GenomePosition(refDict, Strand.FWD, 1, 91870422)); - assertEquals(4, tx0Pos3.getPos()); + assertEquals(3, tx0Pos3.getPos()); } /**