From 2ce2cedc120e3afd091b8ae6eed8b37a75041776 Mon Sep 17 00:00:00 2001
From: fairliereese <fairliek@comcast.net>
Date: Thu, 23 Feb 2023 14:28:01 -0800
Subject: [PATCH] updated rest of files to pyfaidx. fixed int overflow error

---
 TranscriptClean.py                            | 18 +++---
 intronBound.py                                | 16 +++--
 testing_suite/test_all_jns_annotated.py       | 11 ++--
 testing_suite/test_attempt_jn_correction.py   |  2 +-
 testing_suite/test_canonOnly_mode.py          |  2 +-
 testing_suite/test_combined_jn_dist.py        |  2 +-
 .../test_compare_seq_cigar_lengths.py         |  2 +-
 testing_suite/test_compute_MD_NM.py           |  2 +-
 testing_suite/test_compute_jI_jM.py           |  2 +-
 testing_suite/test_correct_deletions.py       |  2 +-
 testing_suite/test_correct_insertions.py      |  2 +-
 testing_suite/test_correct_mismatches.py      |  2 +-
 testing_suite/test_example_transcripts.py     |  2 +-
 testing_suite/test_example_varDM_monoexon.py  |  2 +-
 testing_suite/test_fetch_donor_or_acceptor.py |  2 +-
 .../test_find_closest_annotated_SJ.py         |  2 +-
 .../test_find_closest_annotated_bound.py      |  2 +-
 testing_suite/test_find_ncsj.py               |  2 +-
 .../test_fix_one_side_of_junction.py          |  2 +-
 testing_suite/test_full_dryrun.py             |  2 +-
 .../test_high_level_ncsj_correction.py        |  2 +-
 testing_suite/test_init_transcript_object.py  |  2 +-
 testing_suite/test_print_fasta_seq.py         |  2 +-
 testing_suite/test_rmTmp_option.py            |  2 +-
 .../test_update_post_ncsj_correction.py       |  2 +-
 transcript.py                                 | 60 ++++++++++---------
 26 files changed, 81 insertions(+), 68 deletions(-)

diff --git a/TranscriptClean.py b/TranscriptClean.py
index 68092fe..d52c3bf 100644
--- a/TranscriptClean.py
+++ b/TranscriptClean.py
@@ -332,14 +332,14 @@ def transcript_init(transcript_line, genome, sjAnnot):
     if logInfo.Mapping != "primary":
         return None, logInfo
 
-    try:
-        transcript = Transcript(sam_fields, genome, sjAnnot)
+    # try:
+    transcript = Transcript(sam_fields, genome, sjAnnot)
 
-    except Exception as e:
-        warnings.warn("Problem parsing transcript with ID '" +
-                      logInfo.TranscriptID + "'")
-        print(e)
-        return None, logInfo
+    # except Exception as e:
+    #     warnings.warn("Problem parsing transcript with ID '" +
+    #                   logInfo.TranscriptID + "'")
+    #     print(e)
+    #     return None, logInfo
 
     return transcript, logInfo
 
@@ -1365,7 +1365,9 @@ def combinedJunctionDist(dist_0, dist_1):
     """
     # If dist_0 and dist_1 have different signs, the combined distance is
     # the sum of their absolute values
-    if dist_0*dist_1 <= 0:
+    # https://stackoverflow.com/questions/66882/simplest-way-to-check-if-two-integers-have-same-sign
+    if ((dist_0<0)!=(dist_1<0)):
+    # if dist_0*dist_1 <= 0:
         combined_dist = abs(dist_0) + abs(dist_1)
     else:
         combined_dist = abs(abs(dist_0) - abs(dist_1))
diff --git a/intronBound.py b/intronBound.py
index ff848c7..fae293f 100644
--- a/intronBound.py
+++ b/intronBound.py
@@ -19,7 +19,7 @@ def __init__(self, transcriptID, jnNumber, ibNumber, chrom, pos, strand):
         self.strand = strand
 
     def getBED(self):
-        """ Format the intron boundary with 0-based start and end. 
+        """ Format the intron boundary with 0-based start and end.
          If mode == "start", we return a bed for the start position."""
 
         bedstr = "\t".join([self.chrom, str(self.pos - 1), str(self.pos),
@@ -31,9 +31,15 @@ def getSpliceMotif(self, genome):
             intron (first two if bound == 0 and last two if bound == 1) """
 
         if self.bound == 0:
-            motif = genome.sequence(
-                {'chr': self.chrom, 'start': self.pos, 'stop': self.pos + 1}, one_based=True)
+            # motif = genome.sequence(
+            #     {'chr': self.chrom, 'start': self.pos, 'stop': self.pos + 1}, one_based=True)
+            motif = genome.get_seq(self.chrom,
+                                   self.pos,
+                                   self.pos+1).seq
         else:
-            motif = genome.sequence(
-                {'chr': self.chrom, 'start': self.pos - 1, 'stop': self.pos}, one_based=True)
+            # motif = genome.sequence(
+            #     {'chr': self.chrom, 'start': self.pos - 1, 'stop': self.pos}, one_based=True)
+            motif = genome.get_seq(self.chrom,
+                                   self.pos-1,
+                                   self.pos).seq
         return motif
diff --git a/testing_suite/test_all_jns_annotated.py b/testing_suite/test_all_jns_annotated.py
index edd3ba4..4e994ee 100644
--- a/testing_suite/test_all_jns_annotated.py
+++ b/testing_suite/test_all_jns_annotated.py
@@ -1,5 +1,5 @@
 import pytest
-from pyfasta import Fasta
+from pyfaidx import Fasta
 import sys
 sys.path.append("..")
 import transcript as t2
@@ -11,7 +11,7 @@ class TestAllSJsAnnot(object):
     def test_no_jns(self):
         """ Return transcript.allJnsAnnotated = True for a transcript without
             junctions"""
-        
+
         sam = "input_files/sams/perfectReferenceMatch_noIntrons.sam"
         genome = Fasta("input_files/hg38_chr1.fa")
 
@@ -56,7 +56,7 @@ def test_two_annotated_SJs_without_ref(self):
         assert transcript.isCanonical == True
 
     def test_noncanonical(self):
-        """ Transcript should be noncanonical and un-annotated prior to 
+        """ Transcript should be noncanonical and un-annotated prior to
             correction, but be canonical and annotated afterwards """
 
         sam = "input_files/sams/deletion_insertion_mismatch_nc.sam"
@@ -70,15 +70,14 @@ def test_noncanonical(self):
 
         with open(sam, 'r') as f:
             sam_line = f.readline().strip()
-            transcript, logInfo = TC.transcript_init(sam_line, refs.genome, 
+            transcript, logInfo = TC.transcript_init(sam_line, refs.genome,
                                                      refs.sjAnnot)
 
         assert transcript.allJnsAnnotated == False
         assert transcript.isCanonical == False
 
         # Now correct the junction and retest
-        upd_transcript, TE = TC.cleanNoncanonical(transcript, refs, 5, logInfo)        
+        upd_transcript, TE = TC.cleanNoncanonical(transcript, refs, 5, logInfo)
 
         assert upd_transcript.allJnsAnnotated == True
         assert upd_transcript.isCanonical == True
-
diff --git a/testing_suite/test_attempt_jn_correction.py b/testing_suite/test_attempt_jn_correction.py
index 1b80f05..4caba48 100644
--- a/testing_suite/test_attempt_jn_correction.py
+++ b/testing_suite/test_attempt_jn_correction.py
@@ -1,5 +1,5 @@
 import pytest
-from pyfasta import Fasta
+from pyfaidx import Fasta
 import sys
 sys.path.append("..")
 import transcript as t2
diff --git a/testing_suite/test_canonOnly_mode.py b/testing_suite/test_canonOnly_mode.py
index a044448..a410c81 100644
--- a/testing_suite/test_canonOnly_mode.py
+++ b/testing_suite/test_canonOnly_mode.py
@@ -1,5 +1,5 @@
 import pytest
-from pyfasta import Fasta
+from pyfaidx import Fasta
 import sys
 import os
 import subprocess
diff --git a/testing_suite/test_combined_jn_dist.py b/testing_suite/test_combined_jn_dist.py
index c322b45..13fc3b9 100644
--- a/testing_suite/test_combined_jn_dist.py
+++ b/testing_suite/test_combined_jn_dist.py
@@ -1,5 +1,5 @@
 import pytest
-from pyfasta import Fasta
+from pyfaidx import Fasta
 import sys
 sys.path.append("..")
 import TranscriptClean as TC
diff --git a/testing_suite/test_compare_seq_cigar_lengths.py b/testing_suite/test_compare_seq_cigar_lengths.py
index 20d2a60..d205429 100644
--- a/testing_suite/test_compare_seq_cigar_lengths.py
+++ b/testing_suite/test_compare_seq_cigar_lengths.py
@@ -1,5 +1,5 @@
 import pytest
-from pyfasta import Fasta
+from pyfaidx import Fasta
 import sys
 sys.path.append("..")
 import transcript as t2
diff --git a/testing_suite/test_compute_MD_NM.py b/testing_suite/test_compute_MD_NM.py
index a8089e4..9f65748 100644
--- a/testing_suite/test_compute_MD_NM.py
+++ b/testing_suite/test_compute_MD_NM.py
@@ -1,5 +1,5 @@
 import pytest
-from pyfasta import Fasta
+from pyfaidx import Fasta
 import sys
 sys.path.append("..")
 import transcript as t2
diff --git a/testing_suite/test_compute_jI_jM.py b/testing_suite/test_compute_jI_jM.py
index 111bf9e..839d8c8 100644
--- a/testing_suite/test_compute_jI_jM.py
+++ b/testing_suite/test_compute_jI_jM.py
@@ -1,5 +1,5 @@
 import pytest
-from pyfasta import Fasta
+from pyfaidx import Fasta
 import sys
 sys.path.append("..")
 import transcript as t2
diff --git a/testing_suite/test_correct_deletions.py b/testing_suite/test_correct_deletions.py
index 3bc6bb7..bb383e7 100644
--- a/testing_suite/test_correct_deletions.py
+++ b/testing_suite/test_correct_deletions.py
@@ -1,5 +1,5 @@
 import pytest
-from pyfasta import Fasta
+from pyfaidx import Fasta
 import sys
 sys.path.append("..")
 import transcript as t2
diff --git a/testing_suite/test_correct_insertions.py b/testing_suite/test_correct_insertions.py
index b7422d3..cab58e5 100644
--- a/testing_suite/test_correct_insertions.py
+++ b/testing_suite/test_correct_insertions.py
@@ -1,5 +1,5 @@
 import pytest
-from pyfasta import Fasta
+from pyfaidx import Fasta
 import sys
 sys.path.append("..")
 import transcript as t2
diff --git a/testing_suite/test_correct_mismatches.py b/testing_suite/test_correct_mismatches.py
index ce19bb5..b72f4df 100644
--- a/testing_suite/test_correct_mismatches.py
+++ b/testing_suite/test_correct_mismatches.py
@@ -1,5 +1,5 @@
 import pytest
-from pyfasta import Fasta
+from pyfaidx import Fasta
 import sys
 sys.path.append("..")
 import transcript as t2
diff --git a/testing_suite/test_example_transcripts.py b/testing_suite/test_example_transcripts.py
index d41cffd..f7b2e7b 100644
--- a/testing_suite/test_example_transcripts.py
+++ b/testing_suite/test_example_transcripts.py
@@ -1,5 +1,5 @@
 import pytest
-from pyfasta import Fasta
+from pyfaidx import Fasta
 import sys
 import os
 import subprocess
diff --git a/testing_suite/test_example_varDM_monoexon.py b/testing_suite/test_example_varDM_monoexon.py
index 7d2acf1..80d15d8 100644
--- a/testing_suite/test_example_varDM_monoexon.py
+++ b/testing_suite/test_example_varDM_monoexon.py
@@ -1,5 +1,5 @@
 import pytest
-from pyfasta import Fasta
+from pyfaidx import Fasta
 import sys
 import os
 import subprocess
diff --git a/testing_suite/test_fetch_donor_or_acceptor.py b/testing_suite/test_fetch_donor_or_acceptor.py
index e4e7be7..87debf4 100644
--- a/testing_suite/test_fetch_donor_or_acceptor.py
+++ b/testing_suite/test_fetch_donor_or_acceptor.py
@@ -1,5 +1,5 @@
 import pytest
-from pyfasta import Fasta
+from pyfaidx import Fasta
 import sys
 sys.path.append("..")
 import spliceJunction as sj
diff --git a/testing_suite/test_find_closest_annotated_SJ.py b/testing_suite/test_find_closest_annotated_SJ.py
index a40f7f2..b0049aa 100644
--- a/testing_suite/test_find_closest_annotated_SJ.py
+++ b/testing_suite/test_find_closest_annotated_SJ.py
@@ -1,5 +1,5 @@
 import pytest
-from pyfasta import Fasta
+from pyfaidx import Fasta
 import sys
 sys.path.append("..")
 import spliceJunction as sj
diff --git a/testing_suite/test_find_closest_annotated_bound.py b/testing_suite/test_find_closest_annotated_bound.py
index e2d3f81..fa5b0ee 100644
--- a/testing_suite/test_find_closest_annotated_bound.py
+++ b/testing_suite/test_find_closest_annotated_bound.py
@@ -6,7 +6,7 @@
 import intronBound as ib
 import spliceJunction as sj
 import pytest
-from pyfasta import Fasta
+from pyfaidx import Fasta
 
 
 @pytest.mark.unit
diff --git a/testing_suite/test_find_ncsj.py b/testing_suite/test_find_ncsj.py
index 4101c19..aafbda6 100644
--- a/testing_suite/test_find_ncsj.py
+++ b/testing_suite/test_find_ncsj.py
@@ -1,5 +1,5 @@
 import pytest
-from pyfasta import Fasta
+from pyfaidx import Fasta
 import sys
 sys.path.append("..")
 import transcript as t2
diff --git a/testing_suite/test_fix_one_side_of_junction.py b/testing_suite/test_fix_one_side_of_junction.py
index fd4558a..6b2efdb 100644
--- a/testing_suite/test_fix_one_side_of_junction.py
+++ b/testing_suite/test_fix_one_side_of_junction.py
@@ -1,5 +1,5 @@
 import pytest
-from pyfasta import Fasta
+from pyfaidx import Fasta
 import sys
 sys.path.append("..")
 import transcript as t2
diff --git a/testing_suite/test_full_dryrun.py b/testing_suite/test_full_dryrun.py
index 130d031..503bf27 100644
--- a/testing_suite/test_full_dryrun.py
+++ b/testing_suite/test_full_dryrun.py
@@ -1,5 +1,5 @@
 import pytest
-from pyfasta import Fasta
+from pyfaidx import Fasta
 import sys
 import os
 import subprocess
diff --git a/testing_suite/test_high_level_ncsj_correction.py b/testing_suite/test_high_level_ncsj_correction.py
index 619f2c0..86fd764 100644
--- a/testing_suite/test_high_level_ncsj_correction.py
+++ b/testing_suite/test_high_level_ncsj_correction.py
@@ -1,5 +1,5 @@
 import pytest
-from pyfasta import Fasta
+from pyfaidx import Fasta
 import os
 import sys
 sys.path.append("..")
diff --git a/testing_suite/test_init_transcript_object.py b/testing_suite/test_init_transcript_object.py
index 1038a59..5129fe1 100644
--- a/testing_suite/test_init_transcript_object.py
+++ b/testing_suite/test_init_transcript_object.py
@@ -1,5 +1,5 @@
 import pytest
-from pyfasta import Fasta
+from pyfaidx import Fasta
 import sys
 sys.path.append("..")
 import TranscriptClean as TC
diff --git a/testing_suite/test_print_fasta_seq.py b/testing_suite/test_print_fasta_seq.py
index c355a63..7e35f50 100644
--- a/testing_suite/test_print_fasta_seq.py
+++ b/testing_suite/test_print_fasta_seq.py
@@ -1,5 +1,5 @@
 import pytest
-from pyfasta import Fasta
+from pyfaidx import Fasta
 import sys
 sys.path.append("..")
 import transcript as ts
diff --git a/testing_suite/test_rmTmp_option.py b/testing_suite/test_rmTmp_option.py
index b325f91..579cc53 100644
--- a/testing_suite/test_rmTmp_option.py
+++ b/testing_suite/test_rmTmp_option.py
@@ -1,5 +1,5 @@
 import pytest
-from pyfasta import Fasta
+from pyfaidx import Fasta
 import sys
 import os
 import subprocess
diff --git a/testing_suite/test_update_post_ncsj_correction.py b/testing_suite/test_update_post_ncsj_correction.py
index 0e216c7..353f9f6 100644
--- a/testing_suite/test_update_post_ncsj_correction.py
+++ b/testing_suite/test_update_post_ncsj_correction.py
@@ -1,5 +1,5 @@
 import pytest
-from pyfasta import Fasta
+from pyfaidx import Fasta
 import sys
 sys.path.append("..")
 import transcript as t2
diff --git a/transcript.py b/transcript.py
index 460de59..49bacfa 100644
--- a/transcript.py
+++ b/transcript.py
@@ -72,7 +72,7 @@ def __init__(self, samFields, genome, spliceAnnot):
         self.otherFields = "\t".join(otherFields)
 
     def recheckJnsAnnotated(self):
-        """ Check the splice motif of each splice junction to determine 
+        """ Check the splice motif of each splice junction to determine
             whether the transcript overall is annotated """
         for jn in self.spliceJunctions:
             if int(jn.motif_code) < 20:
@@ -115,8 +115,8 @@ def compute_transcript_end(self):
         return end - 1
 
     def splitCIGAR(self):
-        """ Takes CIGAR string from SAM and splits it into two lists: 
-            one with capital letters (match operators), and one with 
+        """ Takes CIGAR string from SAM and splits it into two lists:
+            one with capital letters (match operators), and one with
             the number of bases that each operation applies to. """
 
         #alignTypes = re.sub('[0-9]', " ", self.CIGAR).split()
@@ -126,8 +126,8 @@ def splitCIGAR(self):
         return splitCIGARstr(self.CIGAR)  # alignTypes, counts
 
     def splitMD(self):
-        """ Takes MD tag and splits into two lists: 
-            one with capital letters (match operators), and one with 
+        """ Takes MD tag and splits into two lists:
+            one with capital letters (match operators), and one with
             the number of bases that each operation applies to. """
 
         MD = str(self.MD).split(":")[2]
@@ -157,9 +157,9 @@ def splitMD(self):
         return operations, counts
 
     def mergeMDwithCIGAR(self):
-        """ Takes the MD and CIGAR strings, and combines them into a unified 
+        """ Takes the MD and CIGAR strings, and combines them into a unified
             structure that encodes all possible operations w.r.t the reference:
-            match, mismatch, deletion, insertion, hard clipping, 
+            match, mismatch, deletion, insertion, hard clipping,
             and soft clipping. """
 
         mergeCounts = []
@@ -213,7 +213,7 @@ def mergeMDwithCIGAR(self):
         return mergeOperations, mergeCounts
 
     def parseSpliceJunctions(self, genome, spliceAnnot):
-        """ Takes the splice junction information from the SAM input and 
+        """ Takes the splice junction information from the SAM input and
             creates a SpliceJunction object for each junction."""
 
         intronBounds = ((self.jI).split(":")[-1]).split(",")[1:]
@@ -278,7 +278,7 @@ def getAllIntronBounds(self):
         return result
 
     def getNMandMDFlags(self, genome):
-        """ This function uses the transcript sequence, its CIGAR string, 
+        """ This function uses the transcript sequence, its CIGAR string,
             and the reference genome to create NM and MD sam flags."""
         NM = 0
         MD = "MD:Z:"
@@ -297,8 +297,9 @@ def getNMandMDFlags(self, genome):
             if op == "M":
                 for i in range(0, ct):
                     currBase = self.SEQ[seqPos]
-                    refBase = genome.sequence({'chr': self.CHROM, 'start': genomePos,
-                                               'stop': genomePos}, one_based=True)
+                    refBase = genome.get_seq(self.CHROM, genomePos, genomePos).seq
+                    # refBase = genome.sequence({'chr': self.CHROM, 'start': genomePos,
+                    #                            'stop': genomePos}, one_based=True)
                     if refBase == "":
                         return None, None
 
@@ -319,8 +320,10 @@ def getNMandMDFlags(self, genome):
                 # End any match we have going and add the missing reference bases
                 MD = MD + str(MVal)
                 MVal = 0
-                refBases = genome.sequence({'chr': self.CHROM, 'start': genomePos,
-                                            'stop': genomePos + ct - 1}, one_based=True)
+                refBases = genome.get_seq(self.CHROM, genomePos,
+                                          genomePos+ct-1).seq
+                # refBases = genome.sequence({'chr': self.CHROM, 'start': genomePos,
+                #                             'stop': genomePos + ct - 1}, one_based=True)
                 if refBases == "":
                     return None, None
                 MD = MD + "^" + str(refBases)
@@ -390,11 +393,11 @@ def compute_jI(self):
         return jIstr
 
     def getjMandjITags(self, genome, spliceAnnot):
-        """ If the input sam file doesn't have the custom STARlong-derived jM 
-            and jI tags, we need to compute them. This is done by stepping 
-            through the CIGAR string and sequence. When an intron (N) is 
-            encountered, we check the first two bases and last two bases of 
-            the intron in the genome sequence to detemine whether they are 
+        """ If the input sam file doesn't have the custom STARlong-derived jM
+            and jI tags, we need to compute them. This is done by stepping
+            through the CIGAR string and sequence. When an intron (N) is
+            encountered, we check the first two bases and last two bases of
+            the intron in the genome sequence to detemine whether they are
             canonical. We also record the start and end position of the intron. """
 
         seq = self.SEQ
@@ -410,14 +413,17 @@ def getjMandjITags(self, genome, spliceAnnot):
             if op == "N":
                 # This is an intron
                 intronStart = genomePos
-                startBases = genome.sequence({'chr': self.CHROM,
-                                              'start': genomePos,
-                                              'stop': genomePos + 1},
-                                             one_based=True)
+                # startBases = genome.sequence({'chr': self.CHROM,
+                #                               'start': genomePos,
+                #                               'stop': genomePos + 1},
+                #                              one_based=True)
+                startBases = genome.get_seq(self.CHROM, genomePos,
+                                            genomePos+1).seq
                 intronEnd = genomePos + ct - 1
-                endBases = genome.sequence({'chr': self.CHROM,
-                                            'start': intronEnd - 1,
-                                            'stop': intronEnd}, one_based=True)
+                # endBases = genome.sequence({'chr': self.CHROM,
+                #                             'start': intronEnd - 1,
+                #                             'stop': intronEnd}, one_based=True)
+                endBases = genome.get_seq(self.CHROM, intronEnd-1, intron_end).seq
 
                 # Check if junction is annotated
                 if self.strand == "+":
@@ -465,7 +471,7 @@ def base_wise_CIGAR(self):
 
     def fetch_region_sequence(self, chromosome, start, end):
         """ Walks the SAM sequence to return the bases in the specified region.
-            Returns None if the sequence is not available, i.e. because the 
+            Returns None if the sequence is not available, i.e. because the
             region does not overlap with the transcript, or because it is in
             an intron. Supplied coordinates should be 1-based. """
 
@@ -533,7 +539,7 @@ def getSJMotifCode(startBases, endBases):
 
 
 def reverseComplement(seq):
-    """ Returns the reverse complement of a DNA sequence, 
+    """ Returns the reverse complement of a DNA sequence,
         retaining the case of each letter"""
     complement = ""