Skip to content

Commit

Permalink
Fixing projection issues.
Browse files Browse the repository at this point in the history
  • Loading branch information
holtgrewe committed Nov 13, 2019
1 parent 53fff52 commit 3dbfada
Show file tree
Hide file tree
Showing 11 changed files with 90 additions and 44 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@
* Making faMT annotation for refSeq optional
* Update RefSeq parser that does not run into null-pointer exceptions on mouse and rat genome (e.g. when no exon defines the gene name)
* Fixing issue with block substitutions (#475).
* Fixing RefSeq build to properly assign transcript model (use best match with alignment) in the case of duplicates.
* Fixing issue in projection in the case of leading gaps (has no effect on CDS position prediction).
* Adding `TranscriptModel.getTrimmedSequence()` that removes leading and trailing (unaligned in RefSeq) sequence.

### jannovar-htsjdk

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ protected NucleotideChange getCDSNTChange() {
} catch (ProjectionException e) {
throw new Error("Bug: at this point, the position must be a transcript position");
}
if (DuplicationChecker.isDuplication(transcript.getSequence(), change.getAlt(), txPos.getPos())) {
if (DuplicationChecker.isDuplication(transcript.getTrimmedSequence(), change.getAlt(), txPos.getPos())) {
NucleotidePointLocationBuilder posBuilder = new NucleotidePointLocationBuilder(transcript);
if (change.getAlt().length() == 1) {
try {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -76,8 +76,8 @@ private Annotation buildCDSExonicAnnotation() {

// Check that the WT nucleotide from the transcript is consistent with change.ref and generate a warning message
// if this is not the case.
if (txPos.getPos() >= transcript.getSequence().length()
|| !transcript.getSequence().substring(txPos.getPos(), txPos.getPos() + 1).equals(change.getRef()))
if (txPos.getPos() >= transcript.getTrimmedSequence().length()
|| !transcript.getTrimmedSequence().substring(txPos.getPos(), txPos.getPos() + 1).equals(change.getRef()))
messages.add(AnnotationMessage.WARNING_REF_DOES_NOT_MATCH_TRANSCRIPT);

// Compute the frame shift and codon start position.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ public ImmutableList<TranscriptModel> run() throws TranscriptParseException {
// Load the FASTA file and assign to the builders.
final String pathFASTA = PathUtil.join(basePath, getINIFileName("rna"));
loadMitochondrialFASTA(builders);

loadFASTA(builders, pathFASTA);

// Create final list of TranscriptModels.
Expand Down Expand Up @@ -158,13 +158,13 @@ private void loadMitochondrialFASTA(Map<String, TranscriptModelBuilder> builders
LOGGER.warn("Key for chrMT FASTA File does not exist, skipping.");
return;
}

final String pathFasta = PathUtil.join(basePath, getINIFileName("faMT"));
if (!new File(pathFasta).exists()) {
LOGGER.warn("The chrMT FASTA File {} does not exist, skipping.", new Object[]{ pathFasta });
return;
}


final int idMT = refDict.getContigNameToID().get("chrMT");

Expand Down Expand Up @@ -303,7 +303,7 @@ private Map<String, TranscriptModelBuilder> loadTranscriptModels(String pathGFF)

Set<String> wantedTypes = Sets.newHashSet("exon", "CDS", "stop_codon");
boolean onlyCurated = onlyCurated();

// Some exons do not have a gene name in mm9 and rat releases.
// Therefore we select collect the name from the gene.
Map<String, String> entrezMap = new HashMap<>();
Expand All @@ -327,13 +327,13 @@ private Map<String, TranscriptModelBuilder> loadTranscriptModels(String pathGFF)
boolean isMT = contigDict.containsKey("chrMT") &&
contigDict.get("chrMT").equals(contigDict.get(record.getSeqID()));
if ("gene".equals(record.getType())) {

String entrezId = parseGeneID(record);

String symbol = null;
symbol = record.getAttributes().get("gene");
entrezMap.put(entrezId, symbol);

// Register chrMT Entrez IDs to the "MT..." gene name.
if (isMT) {
if (record.getAttributes().get("gene_synonym") == null) {
Expand All @@ -349,7 +349,7 @@ private Map<String, TranscriptModelBuilder> loadTranscriptModels(String pathGFF)
mtEntrezMap.put(entrezId, symbol);
}
}


// Handle the records describing the transcript structure.
//
Expand Down Expand Up @@ -380,12 +380,12 @@ private Map<String, TranscriptModelBuilder> loadTranscriptModels(String pathGFF)
}

String entrezId = parseGeneID(record);

// // skip on chrMT some tRNA that do not have any Dbxref. This happens in mm9 and rat v6
// if (record.getAttributes().get("gbkey") == "tRNA" && entrezId == null) {
// continue;
// }

transcriptId = mtEntrezMap.get(entrezId);
builder.setAccession(transcriptId);
}
Expand Down Expand Up @@ -511,9 +511,9 @@ private TranscriptModelBuilder createNewTranscriptModelBuilder(FeatureRecord rec
builder.setTxVersion(record.getAttributes().get("transcript_version"));
String geneID = parseGeneID(record);
builder.setGeneID(geneID);

String gene = record.getAttributes().get("gene");

if (gene == null && geneID != null && notMtEntrezMap.containsKey(geneID)) {
gene = notMtEntrezMap.get(geneID);
}
Expand Down Expand Up @@ -646,14 +646,15 @@ private Map<String, TranscriptModelBuilder> mapNonRedundantTranscriptIdsToTransc
} else {
duplicatedTranscriptIdCount++;
// This is a tricky call - there are about 30 transcripts with duplicated transcriptIds due to imperfect
// alignment to the genomic reference
LOGGER.warn("Transcript {} has {} possible transcript models - using the longest model", transcriptId, parentIds
// alignment to the genomic reference. We chose the one with the best match between alignment from cDNA_match
// and exons.
LOGGER.warn("Transcript {} has {} possible transcript models - picking best by cDNA_match", transcriptId, parentIds
.size());
List<TranscriptModelBuilder> possibleModels = parentIds.stream()
.map(transcriptModelsWithTxRegion::get)
.collect(toList());
TranscriptModelBuilder longestCds = findLongestTranscriptRegion(possibleModels);
transcriptIdToTranscriptModelBuilders.put(transcriptId, longestCds);
TranscriptModelBuilder bestModel = findBestModelByCdnaMatch(possibleModels);
transcriptIdToTranscriptModelBuilders.put(transcriptId, bestModel);
}
}
}
Expand All @@ -663,13 +664,45 @@ private Map<String, TranscriptModelBuilder> mapNonRedundantTranscriptIdsToTransc
return transcriptIdToTranscriptModelBuilders;
}

private TranscriptModelBuilder findLongestTranscriptRegion(List<TranscriptModelBuilder> possibleModels) {
TranscriptModelBuilder longestCds = possibleModels.get(0);
private static int countCdnaMatches(TranscriptModelBuilder model) {
int result = 0;
int i = 0;
for (AlignmentPart part : model.getAlignmentParts()) {
if (i >= model.getExonRegions().size()) {
continue;
}
final GenomeInterval exon = model.getExonRegions().get(i).withStrand(Strand.FWD);

final int partBegin;
final int partEnd;

if (part.refBeginPos < part.refEndPos) {
partBegin = part.refBeginPos;
partEnd = part.refEndPos;
} else {
partEnd = part.refBeginPos;
partBegin = part.refEndPos;
}

if (exon.getBeginPos() == partBegin && exon.getEndPos() == partEnd) {
result += 1;
}

i += 1;
}
return result;
}

private TranscriptModelBuilder findBestModelByCdnaMatch(List<TranscriptModelBuilder> possibleModels) {
TranscriptModelBuilder bestMatch = possibleModels.get(0);
int bestMatchCount = countCdnaMatches(bestMatch);
for (TranscriptModelBuilder current : possibleModels) {
if (current.getTXRegion().length() > longestCds.getTXRegion().length()) {
longestCds = current;
int currentMatchCount = countCdnaMatches(current);
if (currentMatchCount > bestMatchCount) {
bestMatch = current;
bestMatchCount = currentMatchCount;
}
}
return longestCds;
return bestMatch;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ public static GenomeVariant normalizeInsertion(TranscriptModel transcript, Genom

// Insert the ALT bases at the position indicated by txPos.
int pos = txPos.getPos();
StringBuilder builder = new StringBuilder(transcript.getSequence());
StringBuilder builder = new StringBuilder(transcript.getTrimmedSequence());
builder.insert(pos, change.getAlt());

// Execute algorithm and compute the shift.
Expand Down Expand Up @@ -106,7 +106,7 @@ public static GenomeVariant normalizeDeletion(TranscriptModel transcript, Genome
// Shift the deletion to the 3' (right) end of the transcript.
int pos = txPos.getPos();
final int LEN = change.getRef().length(); // length of the deletion
final String seq = transcript.getSequence();
final String seq = transcript.getTrimmedSequence();
int shift = 0;

while ((pos + LEN < seq.length()) && (seq.charAt(pos) == seq.charAt(pos + LEN))) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ public TranscriptPosition getTranscriptEndPos() {
*/
@Override
public String toString() {
return StringUtil.concatenate(transcript.getAccession(), ":n.", beginPos + 1, "-", endPos);
return StringUtil.concatenate(transcript.getAccession(), ":n.", beginPos + 1, "_", endPos);
}

/*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,15 @@ public String getSequence() {
return sequence;
}

/**
* @return mDNA sequence of the spliced RNA of this known gene transcript with leading and
* trailing sequences removed.
*/
public String getTrimmedSequence() {
final Alignment ali = seqAlignment;
return sequence.substring(ali.refLeadingGapLength(), sequence.length() - ali.refTrailingGapLength());
}

/**
* @Return the sequence alignment to the exonic genome reference
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ public String getCDSTranscript() {
try {
TranscriptPosition tBeginPos = genomeToTranscriptPos(transcript.getCDSRegion().getGenomeBeginPos());
TranscriptPosition tEndPos = genomeToTranscriptPos(transcript.getCDSRegion().getGenomeEndPos());
return transcript.getSequence().substring(tBeginPos.getPos(), tEndPos.getPos());
return transcript.getTrimmedSequence().substring(tBeginPos.getPos(), tEndPos.getPos());
} catch (ProjectionException e) {
throw new Error("Bug: CDS begin/end must be translatable into transcript positions");
}
Expand All @@ -59,7 +59,7 @@ public String getCDSTranscript() {
public String getTranscriptStartingAtCDS() {
try {
TranscriptPosition tBeginPos = genomeToTranscriptPos(transcript.getCDSRegion().getGenomeBeginPos());
return transcript.getSequence().substring(tBeginPos.getPos(), transcript.getSequence().length());
return transcript.getTrimmedSequence().substring(tBeginPos.getPos(), transcript.getTrimmedSequence().length());
} catch (ProjectionException e) {
throw new Error("Bug: CDS begin must be translatable into transcript positions");
}
Expand Down Expand Up @@ -88,7 +88,8 @@ public TranscriptPosition genomeToTranscriptPos(GenomePosition pos) throws Proje
int posInExon = pos.differenceTo(region.getGenomeBeginPos());
int transcriptPos = tOffset + posInExon;
// Project for position on genomic exons to position in sequence.
int projectedTranscriptPos = transcript.getSeqAlignment().projectRefToQry(transcriptPos);
int projectedTranscriptPos = transcript.getSeqAlignment().projectRefToQry(transcriptPos) -
transcript.getSeqAlignment().refLeadingGapLength();
return new TranscriptPosition(transcript, projectedTranscriptPos, PositionType.ZERO_BASED);
}
tOffset += region.length();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ private String getTranscriptWithPointInRefAffected(GenomeVariant change) {
TranscriptSequenceOntologyDecorator soDecorator = new TranscriptSequenceOntologyDecorator(transcript);
if (!transcript.getTXRegion().overlapsWith(change.getGenomeInterval())
|| !soDecorator.overlapsWithExon(change.getGenomeInterval()))
return transcript.getSequence(); // non-coding change, does not affect transcript
return transcript.getTrimmedSequence(); // non-coding change, does not affect transcript

// Get transcript position for the change position.
TranscriptProjectionDecorator projector = new TranscriptProjectionDecorator(transcript);
Expand All @@ -62,7 +62,7 @@ private String getTranscriptWithPointInRefAffected(GenomeVariant change) {
}

// Update base in string using StringBuilder.
StringBuilder builder = new StringBuilder(transcript.getSequence());
StringBuilder builder = new StringBuilder(transcript.getTrimmedSequence());
if (change.getType() == GenomeVariantType.SNV)
builder.setCharAt(tPos.getPos(), change.getAlt().charAt(0));
else
Expand All @@ -73,7 +73,7 @@ private String getTranscriptWithPointInRefAffected(GenomeVariant change) {
private String getTranscriptWithRangeInRefAffected(GenomeVariant change) {
// Short-circuit in the case of change that does not affect the transcript.
if (!transcript.getTXRegion().overlapsWith(change.getGenomeInterval()))
return transcript.getSequence();
return transcript.getTrimmedSequence();

// Get transcript begin and end position.
GenomePosition changeBeginPos = change.getGenomeInterval().getGenomeBeginPos();
Expand All @@ -94,7 +94,7 @@ private String getTranscriptWithRangeInRefAffected(GenomeVariant change) {
}

// Build resulting transcript string.
StringBuilder builder = new StringBuilder(transcript.getSequence());
StringBuilder builder = new StringBuilder(transcript.getTrimmedSequence());
builder.delete(tBeginPos.getPos(), tEndPos.getPos());
builder.insert(tBeginPos.getPos(), change.getAlt());
return builder.toString();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -80,10 +80,10 @@ public String getCodonAt(TranscriptPosition txPos, CDSPosition cdsPos) throws In
int frameShift = cdsPos.getPos() % 3;
int codonStart = txPos.getPos() - frameShift; // codon start in transcript string
int endPos = codonStart + 3;
if (transcript.getSequence().length() < endPos)
if (transcript.getTrimmedSequence().length() < endPos)
throw new InvalidCodonException("Could not access codon " + codonStart + " - " + endPos
+ ", transcript sequence length is " + transcript.getSequence().length());
return transcript.getSequence().substring(codonStart, endPos);
+ ", transcript sequence length is " + transcript.getTrimmedSequence().length());
return transcript.getTrimmedSequence().substring(codonStart, endPos);
}

/**
Expand All @@ -102,9 +102,9 @@ public String getCodonsStartingFrom(TranscriptPosition txPos, CDSPosition cdsPos
int frameShift = cdsPos.getPos() % 3;
int codonStart = txPos.getPos() - frameShift; // codon start in transcript string
int endPos = codonStart + 3 * count;
if (endPos > transcript.getSequence().length())
endPos = transcript.getSequence().length();
return transcript.getSequence().substring(codonStart, endPos);
if (endPos > transcript.getTrimmedSequence().length())
endPos = transcript.getTrimmedSequence().length();
return transcript.getTrimmedSequence().substring(codonStart, endPos);
}

/**
Expand All @@ -116,7 +116,7 @@ public String getCodonsStartingFrom(TranscriptPosition txPos, CDSPosition cdsPos
* @return the codon affected by a change at the given position
*/
public String getCodonsStartingFrom(TranscriptPosition txPos, CDSPosition cdsPos) {
return getCodonsStartingFrom(txPos, cdsPos, transcript.getSequence().length());
return getCodonsStartingFrom(txPos, cdsPos, transcript.getTrimmedSequence().length());
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -414,16 +414,16 @@ private String printTranscriptModel(TranscriptModel transcriptModel) {
TranscriptProjectionDecorator tx0Decorator = new TranscriptProjectionDecorator(tx0);
TranscriptPosition tx0Pos0 = tx0Decorator
.genomeToTranscriptPos(new GenomePosition(refDict, Strand.FWD, 1, 91870425));
assertEquals(1, tx0Pos0.getPos());
assertEquals(0, tx0Pos0.getPos());
TranscriptPosition tx0Pos1 = tx0Decorator
.genomeToTranscriptPos(new GenomePosition(refDict, Strand.FWD, 1, 91870424));
assertEquals(2, tx0Pos1.getPos());
assertEquals(1, tx0Pos1.getPos());
TranscriptPosition tx0Pos2 = tx0Decorator
.genomeToTranscriptPos(new GenomePosition(refDict, Strand.FWD, 1, 91870423));
assertEquals(3, tx0Pos2.getPos());
assertEquals(2, tx0Pos2.getPos());
TranscriptPosition tx0Pos3 = tx0Decorator
.genomeToTranscriptPos(new GenomePosition(refDict, Strand.FWD, 1, 91870422));
assertEquals(4, tx0Pos3.getPos());
assertEquals(3, tx0Pos3.getPos());
}

/**
Expand Down

0 comments on commit 3dbfada

Please sign in to comment.