From 2ce2cedc120e3afd091b8ae6eed8b37a75041776 Mon Sep 17 00:00:00 2001 From: fairliereese Date: Thu, 23 Feb 2023 14:28:01 -0800 Subject: [PATCH] updated rest of files to pyfaidx. fixed int overflow error --- TranscriptClean.py | 18 +++--- intronBound.py | 16 +++-- testing_suite/test_all_jns_annotated.py | 11 ++-- testing_suite/test_attempt_jn_correction.py | 2 +- testing_suite/test_canonOnly_mode.py | 2 +- testing_suite/test_combined_jn_dist.py | 2 +- .../test_compare_seq_cigar_lengths.py | 2 +- testing_suite/test_compute_MD_NM.py | 2 +- testing_suite/test_compute_jI_jM.py | 2 +- testing_suite/test_correct_deletions.py | 2 +- testing_suite/test_correct_insertions.py | 2 +- testing_suite/test_correct_mismatches.py | 2 +- testing_suite/test_example_transcripts.py | 2 +- testing_suite/test_example_varDM_monoexon.py | 2 +- testing_suite/test_fetch_donor_or_acceptor.py | 2 +- .../test_find_closest_annotated_SJ.py | 2 +- .../test_find_closest_annotated_bound.py | 2 +- testing_suite/test_find_ncsj.py | 2 +- .../test_fix_one_side_of_junction.py | 2 +- testing_suite/test_full_dryrun.py | 2 +- .../test_high_level_ncsj_correction.py | 2 +- testing_suite/test_init_transcript_object.py | 2 +- testing_suite/test_print_fasta_seq.py | 2 +- testing_suite/test_rmTmp_option.py | 2 +- .../test_update_post_ncsj_correction.py | 2 +- transcript.py | 60 ++++++++++--------- 26 files changed, 81 insertions(+), 68 deletions(-) diff --git a/TranscriptClean.py b/TranscriptClean.py index 68092fe..d52c3bf 100644 --- a/TranscriptClean.py +++ b/TranscriptClean.py @@ -332,14 +332,14 @@ def transcript_init(transcript_line, genome, sjAnnot): if logInfo.Mapping != "primary": return None, logInfo - try: - transcript = Transcript(sam_fields, genome, sjAnnot) + # try: + transcript = Transcript(sam_fields, genome, sjAnnot) - except Exception as e: - warnings.warn("Problem parsing transcript with ID '" + - logInfo.TranscriptID + "'") - print(e) - return None, logInfo + # except Exception as e: + # warnings.warn("Problem parsing transcript with ID '" + + # logInfo.TranscriptID + "'") + # print(e) + # return None, logInfo return transcript, logInfo @@ -1365,7 +1365,9 @@ def combinedJunctionDist(dist_0, dist_1): """ # If dist_0 and dist_1 have different signs, the combined distance is # the sum of their absolute values - if dist_0*dist_1 <= 0: + # https://stackoverflow.com/questions/66882/simplest-way-to-check-if-two-integers-have-same-sign + if ((dist_0<0)!=(dist_1<0)): + # if dist_0*dist_1 <= 0: combined_dist = abs(dist_0) + abs(dist_1) else: combined_dist = abs(abs(dist_0) - abs(dist_1)) diff --git a/intronBound.py b/intronBound.py index ff848c7..fae293f 100644 --- a/intronBound.py +++ b/intronBound.py @@ -19,7 +19,7 @@ def __init__(self, transcriptID, jnNumber, ibNumber, chrom, pos, strand): self.strand = strand def getBED(self): - """ Format the intron boundary with 0-based start and end. + """ Format the intron boundary with 0-based start and end. If mode == "start", we return a bed for the start position.""" bedstr = "\t".join([self.chrom, str(self.pos - 1), str(self.pos), @@ -31,9 +31,15 @@ def getSpliceMotif(self, genome): intron (first two if bound == 0 and last two if bound == 1) """ if self.bound == 0: - motif = genome.sequence( - {'chr': self.chrom, 'start': self.pos, 'stop': self.pos + 1}, one_based=True) + # motif = genome.sequence( + # {'chr': self.chrom, 'start': self.pos, 'stop': self.pos + 1}, one_based=True) + motif = genome.get_seq(self.chrom, + self.pos, + self.pos+1).seq else: - motif = genome.sequence( - {'chr': self.chrom, 'start': self.pos - 1, 'stop': self.pos}, one_based=True) + # motif = genome.sequence( + # {'chr': self.chrom, 'start': self.pos - 1, 'stop': self.pos}, one_based=True) + motif = genome.get_seq(self.chrom, + self.pos-1, + self.pos).seq return motif diff --git a/testing_suite/test_all_jns_annotated.py b/testing_suite/test_all_jns_annotated.py index edd3ba4..4e994ee 100644 --- a/testing_suite/test_all_jns_annotated.py +++ b/testing_suite/test_all_jns_annotated.py @@ -1,5 +1,5 @@ import pytest -from pyfasta import Fasta +from pyfaidx import Fasta import sys sys.path.append("..") import transcript as t2 @@ -11,7 +11,7 @@ class TestAllSJsAnnot(object): def test_no_jns(self): """ Return transcript.allJnsAnnotated = True for a transcript without junctions""" - + sam = "input_files/sams/perfectReferenceMatch_noIntrons.sam" genome = Fasta("input_files/hg38_chr1.fa") @@ -56,7 +56,7 @@ def test_two_annotated_SJs_without_ref(self): assert transcript.isCanonical == True def test_noncanonical(self): - """ Transcript should be noncanonical and un-annotated prior to + """ Transcript should be noncanonical and un-annotated prior to correction, but be canonical and annotated afterwards """ sam = "input_files/sams/deletion_insertion_mismatch_nc.sam" @@ -70,15 +70,14 @@ def test_noncanonical(self): with open(sam, 'r') as f: sam_line = f.readline().strip() - transcript, logInfo = TC.transcript_init(sam_line, refs.genome, + transcript, logInfo = TC.transcript_init(sam_line, refs.genome, refs.sjAnnot) assert transcript.allJnsAnnotated == False assert transcript.isCanonical == False # Now correct the junction and retest - upd_transcript, TE = TC.cleanNoncanonical(transcript, refs, 5, logInfo) + upd_transcript, TE = TC.cleanNoncanonical(transcript, refs, 5, logInfo) assert upd_transcript.allJnsAnnotated == True assert upd_transcript.isCanonical == True - diff --git a/testing_suite/test_attempt_jn_correction.py b/testing_suite/test_attempt_jn_correction.py index 1b80f05..4caba48 100644 --- a/testing_suite/test_attempt_jn_correction.py +++ b/testing_suite/test_attempt_jn_correction.py @@ -1,5 +1,5 @@ import pytest -from pyfasta import Fasta +from pyfaidx import Fasta import sys sys.path.append("..") import transcript as t2 diff --git a/testing_suite/test_canonOnly_mode.py b/testing_suite/test_canonOnly_mode.py index a044448..a410c81 100644 --- a/testing_suite/test_canonOnly_mode.py +++ b/testing_suite/test_canonOnly_mode.py @@ -1,5 +1,5 @@ import pytest -from pyfasta import Fasta +from pyfaidx import Fasta import sys import os import subprocess diff --git a/testing_suite/test_combined_jn_dist.py b/testing_suite/test_combined_jn_dist.py index c322b45..13fc3b9 100644 --- a/testing_suite/test_combined_jn_dist.py +++ b/testing_suite/test_combined_jn_dist.py @@ -1,5 +1,5 @@ import pytest -from pyfasta import Fasta +from pyfaidx import Fasta import sys sys.path.append("..") import TranscriptClean as TC diff --git a/testing_suite/test_compare_seq_cigar_lengths.py b/testing_suite/test_compare_seq_cigar_lengths.py index 20d2a60..d205429 100644 --- a/testing_suite/test_compare_seq_cigar_lengths.py +++ b/testing_suite/test_compare_seq_cigar_lengths.py @@ -1,5 +1,5 @@ import pytest -from pyfasta import Fasta +from pyfaidx import Fasta import sys sys.path.append("..") import transcript as t2 diff --git a/testing_suite/test_compute_MD_NM.py b/testing_suite/test_compute_MD_NM.py index a8089e4..9f65748 100644 --- a/testing_suite/test_compute_MD_NM.py +++ b/testing_suite/test_compute_MD_NM.py @@ -1,5 +1,5 @@ import pytest -from pyfasta import Fasta +from pyfaidx import Fasta import sys sys.path.append("..") import transcript as t2 diff --git a/testing_suite/test_compute_jI_jM.py b/testing_suite/test_compute_jI_jM.py index 111bf9e..839d8c8 100644 --- a/testing_suite/test_compute_jI_jM.py +++ b/testing_suite/test_compute_jI_jM.py @@ -1,5 +1,5 @@ import pytest -from pyfasta import Fasta +from pyfaidx import Fasta import sys sys.path.append("..") import transcript as t2 diff --git a/testing_suite/test_correct_deletions.py b/testing_suite/test_correct_deletions.py index 3bc6bb7..bb383e7 100644 --- a/testing_suite/test_correct_deletions.py +++ b/testing_suite/test_correct_deletions.py @@ -1,5 +1,5 @@ import pytest -from pyfasta import Fasta +from pyfaidx import Fasta import sys sys.path.append("..") import transcript as t2 diff --git a/testing_suite/test_correct_insertions.py b/testing_suite/test_correct_insertions.py index b7422d3..cab58e5 100644 --- a/testing_suite/test_correct_insertions.py +++ b/testing_suite/test_correct_insertions.py @@ -1,5 +1,5 @@ import pytest -from pyfasta import Fasta +from pyfaidx import Fasta import sys sys.path.append("..") import transcript as t2 diff --git a/testing_suite/test_correct_mismatches.py b/testing_suite/test_correct_mismatches.py index ce19bb5..b72f4df 100644 --- a/testing_suite/test_correct_mismatches.py +++ b/testing_suite/test_correct_mismatches.py @@ -1,5 +1,5 @@ import pytest -from pyfasta import Fasta +from pyfaidx import Fasta import sys sys.path.append("..") import transcript as t2 diff --git a/testing_suite/test_example_transcripts.py b/testing_suite/test_example_transcripts.py index d41cffd..f7b2e7b 100644 --- a/testing_suite/test_example_transcripts.py +++ b/testing_suite/test_example_transcripts.py @@ -1,5 +1,5 @@ import pytest -from pyfasta import Fasta +from pyfaidx import Fasta import sys import os import subprocess diff --git a/testing_suite/test_example_varDM_monoexon.py b/testing_suite/test_example_varDM_monoexon.py index 7d2acf1..80d15d8 100644 --- a/testing_suite/test_example_varDM_monoexon.py +++ b/testing_suite/test_example_varDM_monoexon.py @@ -1,5 +1,5 @@ import pytest -from pyfasta import Fasta +from pyfaidx import Fasta import sys import os import subprocess diff --git a/testing_suite/test_fetch_donor_or_acceptor.py b/testing_suite/test_fetch_donor_or_acceptor.py index e4e7be7..87debf4 100644 --- a/testing_suite/test_fetch_donor_or_acceptor.py +++ b/testing_suite/test_fetch_donor_or_acceptor.py @@ -1,5 +1,5 @@ import pytest -from pyfasta import Fasta +from pyfaidx import Fasta import sys sys.path.append("..") import spliceJunction as sj diff --git a/testing_suite/test_find_closest_annotated_SJ.py b/testing_suite/test_find_closest_annotated_SJ.py index a40f7f2..b0049aa 100644 --- a/testing_suite/test_find_closest_annotated_SJ.py +++ b/testing_suite/test_find_closest_annotated_SJ.py @@ -1,5 +1,5 @@ import pytest -from pyfasta import Fasta +from pyfaidx import Fasta import sys sys.path.append("..") import spliceJunction as sj diff --git a/testing_suite/test_find_closest_annotated_bound.py b/testing_suite/test_find_closest_annotated_bound.py index e2d3f81..fa5b0ee 100644 --- a/testing_suite/test_find_closest_annotated_bound.py +++ b/testing_suite/test_find_closest_annotated_bound.py @@ -6,7 +6,7 @@ import intronBound as ib import spliceJunction as sj import pytest -from pyfasta import Fasta +from pyfaidx import Fasta @pytest.mark.unit diff --git a/testing_suite/test_find_ncsj.py b/testing_suite/test_find_ncsj.py index 4101c19..aafbda6 100644 --- a/testing_suite/test_find_ncsj.py +++ b/testing_suite/test_find_ncsj.py @@ -1,5 +1,5 @@ import pytest -from pyfasta import Fasta +from pyfaidx import Fasta import sys sys.path.append("..") import transcript as t2 diff --git a/testing_suite/test_fix_one_side_of_junction.py b/testing_suite/test_fix_one_side_of_junction.py index fd4558a..6b2efdb 100644 --- a/testing_suite/test_fix_one_side_of_junction.py +++ b/testing_suite/test_fix_one_side_of_junction.py @@ -1,5 +1,5 @@ import pytest -from pyfasta import Fasta +from pyfaidx import Fasta import sys sys.path.append("..") import transcript as t2 diff --git a/testing_suite/test_full_dryrun.py b/testing_suite/test_full_dryrun.py index 130d031..503bf27 100644 --- a/testing_suite/test_full_dryrun.py +++ b/testing_suite/test_full_dryrun.py @@ -1,5 +1,5 @@ import pytest -from pyfasta import Fasta +from pyfaidx import Fasta import sys import os import subprocess diff --git a/testing_suite/test_high_level_ncsj_correction.py b/testing_suite/test_high_level_ncsj_correction.py index 619f2c0..86fd764 100644 --- a/testing_suite/test_high_level_ncsj_correction.py +++ b/testing_suite/test_high_level_ncsj_correction.py @@ -1,5 +1,5 @@ import pytest -from pyfasta import Fasta +from pyfaidx import Fasta import os import sys sys.path.append("..") diff --git a/testing_suite/test_init_transcript_object.py b/testing_suite/test_init_transcript_object.py index 1038a59..5129fe1 100644 --- a/testing_suite/test_init_transcript_object.py +++ b/testing_suite/test_init_transcript_object.py @@ -1,5 +1,5 @@ import pytest -from pyfasta import Fasta +from pyfaidx import Fasta import sys sys.path.append("..") import TranscriptClean as TC diff --git a/testing_suite/test_print_fasta_seq.py b/testing_suite/test_print_fasta_seq.py index c355a63..7e35f50 100644 --- a/testing_suite/test_print_fasta_seq.py +++ b/testing_suite/test_print_fasta_seq.py @@ -1,5 +1,5 @@ import pytest -from pyfasta import Fasta +from pyfaidx import Fasta import sys sys.path.append("..") import transcript as ts diff --git a/testing_suite/test_rmTmp_option.py b/testing_suite/test_rmTmp_option.py index b325f91..579cc53 100644 --- a/testing_suite/test_rmTmp_option.py +++ b/testing_suite/test_rmTmp_option.py @@ -1,5 +1,5 @@ import pytest -from pyfasta import Fasta +from pyfaidx import Fasta import sys import os import subprocess diff --git a/testing_suite/test_update_post_ncsj_correction.py b/testing_suite/test_update_post_ncsj_correction.py index 0e216c7..353f9f6 100644 --- a/testing_suite/test_update_post_ncsj_correction.py +++ b/testing_suite/test_update_post_ncsj_correction.py @@ -1,5 +1,5 @@ import pytest -from pyfasta import Fasta +from pyfaidx import Fasta import sys sys.path.append("..") import transcript as t2 diff --git a/transcript.py b/transcript.py index 460de59..49bacfa 100644 --- a/transcript.py +++ b/transcript.py @@ -72,7 +72,7 @@ def __init__(self, samFields, genome, spliceAnnot): self.otherFields = "\t".join(otherFields) def recheckJnsAnnotated(self): - """ Check the splice motif of each splice junction to determine + """ Check the splice motif of each splice junction to determine whether the transcript overall is annotated """ for jn in self.spliceJunctions: if int(jn.motif_code) < 20: @@ -115,8 +115,8 @@ def compute_transcript_end(self): return end - 1 def splitCIGAR(self): - """ Takes CIGAR string from SAM and splits it into two lists: - one with capital letters (match operators), and one with + """ Takes CIGAR string from SAM and splits it into two lists: + one with capital letters (match operators), and one with the number of bases that each operation applies to. """ #alignTypes = re.sub('[0-9]', " ", self.CIGAR).split() @@ -126,8 +126,8 @@ def splitCIGAR(self): return splitCIGARstr(self.CIGAR) # alignTypes, counts def splitMD(self): - """ Takes MD tag and splits into two lists: - one with capital letters (match operators), and one with + """ Takes MD tag and splits into two lists: + one with capital letters (match operators), and one with the number of bases that each operation applies to. """ MD = str(self.MD).split(":")[2] @@ -157,9 +157,9 @@ def splitMD(self): return operations, counts def mergeMDwithCIGAR(self): - """ Takes the MD and CIGAR strings, and combines them into a unified + """ Takes the MD and CIGAR strings, and combines them into a unified structure that encodes all possible operations w.r.t the reference: - match, mismatch, deletion, insertion, hard clipping, + match, mismatch, deletion, insertion, hard clipping, and soft clipping. """ mergeCounts = [] @@ -213,7 +213,7 @@ def mergeMDwithCIGAR(self): return mergeOperations, mergeCounts def parseSpliceJunctions(self, genome, spliceAnnot): - """ Takes the splice junction information from the SAM input and + """ Takes the splice junction information from the SAM input and creates a SpliceJunction object for each junction.""" intronBounds = ((self.jI).split(":")[-1]).split(",")[1:] @@ -278,7 +278,7 @@ def getAllIntronBounds(self): return result def getNMandMDFlags(self, genome): - """ This function uses the transcript sequence, its CIGAR string, + """ This function uses the transcript sequence, its CIGAR string, and the reference genome to create NM and MD sam flags.""" NM = 0 MD = "MD:Z:" @@ -297,8 +297,9 @@ def getNMandMDFlags(self, genome): if op == "M": for i in range(0, ct): currBase = self.SEQ[seqPos] - refBase = genome.sequence({'chr': self.CHROM, 'start': genomePos, - 'stop': genomePos}, one_based=True) + refBase = genome.get_seq(self.CHROM, genomePos, genomePos).seq + # refBase = genome.sequence({'chr': self.CHROM, 'start': genomePos, + # 'stop': genomePos}, one_based=True) if refBase == "": return None, None @@ -319,8 +320,10 @@ def getNMandMDFlags(self, genome): # End any match we have going and add the missing reference bases MD = MD + str(MVal) MVal = 0 - refBases = genome.sequence({'chr': self.CHROM, 'start': genomePos, - 'stop': genomePos + ct - 1}, one_based=True) + refBases = genome.get_seq(self.CHROM, genomePos, + genomePos+ct-1).seq + # refBases = genome.sequence({'chr': self.CHROM, 'start': genomePos, + # 'stop': genomePos + ct - 1}, one_based=True) if refBases == "": return None, None MD = MD + "^" + str(refBases) @@ -390,11 +393,11 @@ def compute_jI(self): return jIstr def getjMandjITags(self, genome, spliceAnnot): - """ If the input sam file doesn't have the custom STARlong-derived jM - and jI tags, we need to compute them. This is done by stepping - through the CIGAR string and sequence. When an intron (N) is - encountered, we check the first two bases and last two bases of - the intron in the genome sequence to detemine whether they are + """ If the input sam file doesn't have the custom STARlong-derived jM + and jI tags, we need to compute them. This is done by stepping + through the CIGAR string and sequence. When an intron (N) is + encountered, we check the first two bases and last two bases of + the intron in the genome sequence to detemine whether they are canonical. We also record the start and end position of the intron. """ seq = self.SEQ @@ -410,14 +413,17 @@ def getjMandjITags(self, genome, spliceAnnot): if op == "N": # This is an intron intronStart = genomePos - startBases = genome.sequence({'chr': self.CHROM, - 'start': genomePos, - 'stop': genomePos + 1}, - one_based=True) + # startBases = genome.sequence({'chr': self.CHROM, + # 'start': genomePos, + # 'stop': genomePos + 1}, + # one_based=True) + startBases = genome.get_seq(self.CHROM, genomePos, + genomePos+1).seq intronEnd = genomePos + ct - 1 - endBases = genome.sequence({'chr': self.CHROM, - 'start': intronEnd - 1, - 'stop': intronEnd}, one_based=True) + # endBases = genome.sequence({'chr': self.CHROM, + # 'start': intronEnd - 1, + # 'stop': intronEnd}, one_based=True) + endBases = genome.get_seq(self.CHROM, intronEnd-1, intron_end).seq # Check if junction is annotated if self.strand == "+": @@ -465,7 +471,7 @@ def base_wise_CIGAR(self): def fetch_region_sequence(self, chromosome, start, end): """ Walks the SAM sequence to return the bases in the specified region. - Returns None if the sequence is not available, i.e. because the + Returns None if the sequence is not available, i.e. because the region does not overlap with the transcript, or because it is in an intron. Supplied coordinates should be 1-based. """ @@ -533,7 +539,7 @@ def getSJMotifCode(startBases, endBases): def reverseComplement(seq): - """ Returns the reverse complement of a DNA sequence, + """ Returns the reverse complement of a DNA sequence, retaining the case of each letter""" complement = ""