From 818250e5ea790e89e4f555aa373b56c2c4289ae3 Mon Sep 17 00:00:00 2001
From: Albert Feghaly <albert.feghaly@umontreal.ca>
Date: Mon, 21 Sep 2020 12:22:01 -0400
Subject: [PATCH 1/6] Temporarily patch T->! bug in binary sequences

---
 pyGeno/Exon.py       | 14 ++++++++++++++
 pyGeno/Transcript.py | 17 +++--------------
 2 files changed, 17 insertions(+), 14 deletions(-)

diff --git a/pyGeno/Exon.py b/pyGeno/Exon.py
index 8fa11ef..19c9617 100644
--- a/pyGeno/Exon.py
+++ b/pyGeno/Exon.py
@@ -109,6 +109,20 @@ def _load_bin_sequence(self) :
 		self.bin_CDS =  NucBinarySequence(self.CDS)
 		self.bin_UTR3 =  NucBinarySequence(self.UTR3)
 		
+	def _patch_seleno(self, e, selenocysteine):
+		if selenocysteine is not None:
+			for position in selenocysteine:
+				if e.CDS_start <= position <= e.CDS_end:
+
+					if e.strand == '+':
+						ajusted_position = position - e.CDS_start
+					else:
+						ajusted_position = e.CDS_end - position - 3
+
+					if e.CDS[ajusted_position] == 'T':
+						e.CDS = list(e.CDS)
+						e.CDS[ajusted_position] = '!'
+
 	def hasCDS(self) :
 		"""returns true or false depending on if the exon has a CDS"""
 		if self.CDS_start != None and self.CDS_end != None:
diff --git a/pyGeno/Transcript.py b/pyGeno/Transcript.py
index 58e3263..adb7851 100644
--- a/pyGeno/Transcript.py
+++ b/pyGeno/Transcript.py
@@ -102,19 +102,8 @@ def setV(k, v) :
 			if e.hasCDS() :
 				UTR5.append(''.join(e.UTR5))
 
-				if self.selenocysteine is not None:
-					for position in self.selenocysteine:
-						if e.CDS_start <= position <= e.CDS_end:
-
-							if e.strand == '+':
-								ajusted_position = position - e.CDS_start
-							else:
-								ajusted_position = e.CDS_end - position - 3
-
-							if e.CDS[ajusted_position] == 'T':
-								e.CDS = list(e.CDS)
-								e.CDS[ajusted_position] = '!'			
-				
+                                e._patch_seleno(e, self.selenocysteine)
+
 				if len(cDNA) == 0 and e.frame != 0 :
 					e.CDS = e.CDS[e.frame:]
 					
@@ -151,7 +140,7 @@ def setV(k, v) :
 	def _load_bin_sequence(self) :
 		self.bin_sequence = NucBinarySequence(self.sequence)
 		self.bin_UTR5 =  NucBinarySequence(self.UTR5)
-		self.bin_cDNA =  NucBinarySequence(self.cDNA)
+		self.bin_cDNA =  NucBinarySequence(self.cDNA.replace('!', 'T' ))
 		self.bin_UTR3 =  NucBinarySequence(self.UTR3)
 
 	def getNucleotideCodon(self, cdnaX1) :

From 017c5f282716aea853158636b975dbd4f46bb762 Mon Sep 17 00:00:00 2001
From: Albert Feghaly <albert.feghaly@umontreal.ca>
Date: Mon, 21 Sep 2020 12:25:54 -0400
Subject: [PATCH 2/6] Indentation fix

---
 pyGeno/Transcript.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyGeno/Transcript.py b/pyGeno/Transcript.py
index adb7851..dc1d4ea 100644
--- a/pyGeno/Transcript.py
+++ b/pyGeno/Transcript.py
@@ -102,7 +102,7 @@ def setV(k, v) :
 			if e.hasCDS() :
 				UTR5.append(''.join(e.UTR5))
 
-                                e._patch_seleno(e, self.selenocysteine)
+				e._patch_seleno(e, self.selenocysteine)
 
 				if len(cDNA) == 0 and e.frame != 0 :
 					e.CDS = e.CDS[e.frame:]

From 81b0973d468edabd0dc451a84732329a0bcfd84d Mon Sep 17 00:00:00 2001
From: Albert Feghaly <albert.feghaly@umontreal.ca>
Date: Mon, 13 Sep 2021 20:28:19 -0400
Subject: [PATCH 3/6] Do not return all amino acid combinations by default

---
 pyGeno/tools/UsefulFunctions.py | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/pyGeno/tools/UsefulFunctions.py b/pyGeno/tools/UsefulFunctions.py
index a614d5e..cb44ba3 100644
--- a/pyGeno/tools/UsefulFunctions.py
+++ b/pyGeno/tools/UsefulFunctions.py
@@ -207,7 +207,7 @@ def translateDNA_6Frames(sequence) :
 
 	return trans
 
-def translateDNA(sequence, frame = 'f1', translTable_id='default') :
+def translateDNA(sequence, frame = 'f1', translTable_id='default', ambiguous=False) :
 	"""Translates DNA code, frame : fwd1, fwd2, fwd3, rev1, rev2, rev3"""
 
 	protein = ""
@@ -232,20 +232,19 @@ def translateDNA(sequence, frame = 'f1', translTable_id='default') :
 	for i in range(0, len(dna),  3) :
 		codon = dna[i:i+3]
 
-		# Check if variant messed with selenocysteine codon
-		if '!' in codon and codon != '!GA':
-			codon = codon.replace('!', 'T')
-
 		if (len(codon) == 3) :
 			try :
 				# MC
 				protein += translTable[translTable_id][codon]
 			except KeyError :
-				combinaisons = polymorphicCodonCombinaisons(list(codon))
-				translations = set()
-				for ci in range(len(combinaisons)):
-					translations.add(translTable[translTable_id][combinaisons[ci]])
-				protein += '/'.join(translations)
+				if ambiguous:
+					combinaisons = polymorphicCodonCombinaisons(list(codon))
+					translations = set()
+					for ci in range(len(combinaisons)):
+						translations.add(translTable[translTable_id][combinaisons[ci]])
+					protein += '/'.join(translations)
+				else:
+					protein += 'X'
 
 	return protein
 

From 3155744aea600a11bcca90230101f387dfc3d70f Mon Sep 17 00:00:00 2001
From: Albert Feghaly <albert.feghaly@umontreal.ca>
Date: Mon, 13 Sep 2021 20:32:33 -0400
Subject: [PATCH 4/6] Undo editing of exon objects in Transcript class

---
 pyGeno/Exon.py                  | 14 --------------
 pyGeno/Transcript.py            | 18 +++++++-----------
 pyGeno/tools/UsefulFunctions.py |  4 +---
 3 files changed, 8 insertions(+), 28 deletions(-)

diff --git a/pyGeno/Exon.py b/pyGeno/Exon.py
index 19c9617..0fa4452 100644
--- a/pyGeno/Exon.py
+++ b/pyGeno/Exon.py
@@ -108,20 +108,6 @@ def _load_bin_sequence(self) :
 		self.bin_UTR5 =  NucBinarySequence(self.UTR5)
 		self.bin_CDS =  NucBinarySequence(self.CDS)
 		self.bin_UTR3 =  NucBinarySequence(self.UTR3)
-		
-	def _patch_seleno(self, e, selenocysteine):
-		if selenocysteine is not None:
-			for position in selenocysteine:
-				if e.CDS_start <= position <= e.CDS_end:
-
-					if e.strand == '+':
-						ajusted_position = position - e.CDS_start
-					else:
-						ajusted_position = e.CDS_end - position - 3
-
-					if e.CDS[ajusted_position] == 'T':
-						e.CDS = list(e.CDS)
-						e.CDS[ajusted_position] = '!'
 
 	def hasCDS(self) :
 		"""returns true or false depending on if the exon has a CDS"""
diff --git a/pyGeno/Transcript.py b/pyGeno/Transcript.py
index dc1d4ea..4b17461 100644
--- a/pyGeno/Transcript.py
+++ b/pyGeno/Transcript.py
@@ -102,18 +102,14 @@ def setV(k, v) :
 			if e.hasCDS() :
 				UTR5.append(''.join(e.UTR5))
 
-				e._patch_seleno(e, self.selenocysteine)
-
-				if len(cDNA) == 0 and e.frame != 0 :
-					e.CDS = e.CDS[e.frame:]
-					
-					if e.strand == '+':
-						e.CDS_start += e.frame
-					else:
-						e.CDS_end -= e.frame
-				
+				if len(cDNA) == 0 and e.frame != 0:
+					cDNA.append('N'*(3-e.frame))
+
 				if len(e.CDS):
 					cDNA.append(''.join(e.CDS))
+				else:
+					print('WARNING: hasCDS flag is incorrect for exon %s.' % e.id)
+
 				UTR3.append(''.join(e.UTR3))
 				prime5 = False
 			else :
@@ -140,7 +136,7 @@ def setV(k, v) :
 	def _load_bin_sequence(self) :
 		self.bin_sequence = NucBinarySequence(self.sequence)
 		self.bin_UTR5 =  NucBinarySequence(self.UTR5)
-		self.bin_cDNA =  NucBinarySequence(self.cDNA.replace('!', 'T' ))
+		self.bin_cDNA =  NucBinarySequence(self.cDNA)
 		self.bin_UTR3 =  NucBinarySequence(self.UTR3)
 
 	def getNucleotideCodon(self, cdnaX1) :
diff --git a/pyGeno/tools/UsefulFunctions.py b/pyGeno/tools/UsefulFunctions.py
index cb44ba3..7b00ee0 100644
--- a/pyGeno/tools/UsefulFunctions.py
+++ b/pyGeno/tools/UsefulFunctions.py
@@ -87,9 +87,7 @@ def saveResults(directoryName, fileName, strResults, log = '', args = ''):
 'GTT' : 'V', 'GTC' : 'V', 'GTA' : 'V', 'GTG' : 'V',
 'GCT' : 'A', 'GCC' : 'A', 'GCA' : 'A', 'GCG' : 'A',
 'GAT' : 'D', 'GAC' : 'D', 'GAA' : 'E', 'GAG' : 'E',
-'GGT' : 'G', 'GGC' : 'G', 'GGA' : 'G', 'GGG' : 'G',
-
-'!GA' : 'U'
+'GGT' : 'G', 'GGC' : 'G', 'GGA' : 'G', 'GGG' : 'G'
 
 }
 codonTable = translTable['default']

From f9c2b6ae1697cc3c5a001e15c1fab47f534f84ff Mon Sep 17 00:00:00 2001
From: Albert Feghaly <albert.feghaly@umontreal.ca>
Date: Mon, 13 Sep 2021 21:17:36 -0400
Subject: [PATCH 5/6] Selenocysteine problem fixed

---
 pyGeno/Protein.py    | 19 ++++++++++++++++---
 pyGeno/Transcript.py | 10 ++++++++++
 2 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/pyGeno/Protein.py b/pyGeno/Protein.py
index f14fed9..e96d04e 100644
--- a/pyGeno/Protein.py
+++ b/pyGeno/Protein.py
@@ -55,10 +55,23 @@ def _makeLoadQuery(self, objectType, *args, **coolArgs) :
 		return pyGenoRabaObjectWrapper._makeLoadQuery(self, objectType, *args, **coolArgs)
 	
 	def _load_sequences(self) :
-		if self.chromosome.number != 'MT':
-			self.sequence = uf.translateDNA(self.transcript.cDNA).rstrip('*')
-		else:
+		if self.chromosome.number == 'MT':
 			self.sequence = uf.translateDNA(self.transcript.cDNA, translTable_id='mt').rstrip('*')
+		elif self.transcript.selenocysteine is not None:
+			sequence = list(uf.translateDNA(self.transcript.cDNA))
+			for p in self.transcript.selenocysteine:
+				p_seq = self.transcript.positions[len(self.transcript.UTR5):].index(p)
+				if self.transcript.gene.strand == '-':
+					p_seq -= 2
+				if p_seq % 3:
+					print('WARNING: Selenocysteine position is not multiple of 3 (%s).' % self.transcript.id)
+				p_seq = p_seq // 3
+				if sequence[p_seq] != '*':
+					print('WARNING: Selenocysteine position does not translate to a stop codon (%s).' % self.transcript.id)
+				sequence[p_seq] = 'U'
+			self.sequence = ''.join(sequence).rstrip('*')
+		else:
+			self.sequence = uf.translateDNA(self.transcript.cDNA).rstrip('*')
 
 	
 	def getSequence(self):
diff --git a/pyGeno/Transcript.py b/pyGeno/Transcript.py
index 4b17461..cf0a0c5 100644
--- a/pyGeno/Transcript.py
+++ b/pyGeno/Transcript.py
@@ -92,12 +92,17 @@ def setV(k, v) :
 		UTR5 = []
 		UTR3 = []
 		exons = []
+		positions = []
 		prime5 = True
 		for ee in self.wrapped_object.exons :
 			e = pyGenoRabaObjectWrapper_metaclass._wrappers[Exon_Raba](wrapped_object_and_bag = (ee, getV('bagKey')))
 			self.exonsDict[(e.start, e.end)] = e
 			exons.append(e)
 			self.data.extend(e.data)
+			if self.gene.strand == '+':
+				positions.extend(range(e.start, e.end))
+			else:
+				positions.extend(range(e.end-1, e.start-1, -1))
 			
 			if e.hasCDS() :
 				UTR5.append(''.join(e.UTR5))
@@ -115,8 +120,12 @@ def setV(k, v) :
 			else :
 				if prime5 :
 					UTR5.append(''.join(e.data))
+					if len(e.UTR3):
+						print("WARNING: exon has 3'UTR before transcript starts (%s)." % self.id)
 				else :
 					UTR3.append(''.join(e.data))
+					if len(e.UTR5):
+						print("WARNING: exon has 5'UTR after transcript ends (%s)." % self.id)
 		
 		sequence = ''.join(self.data)
 		cDNA = ''.join(cDNA)
@@ -127,6 +136,7 @@ def setV(k, v) :
 		setV('cDNA', cDNA)
 		setV('UTR5', UTR5)
 		setV('UTR3', UTR3)
+		setV('positions', positions)
 		
 		if len(cDNA) > 0 and len(cDNA) % 3 != 0 :
 			setV('flags', {'DUBIOUS' : True, 'cDNA_LEN_NOT_MULT_3': True})

From 77fd4dd4313461b8d3ebd81d6031bd7a563129bc Mon Sep 17 00:00:00 2001
From: Albert Feghaly <albert.feghaly@umontreal.ca>
Date: Wed, 15 Sep 2021 23:19:52 -0400
Subject: [PATCH 6/6] Bump version to 2.1.0

---
 pyGeno/doc/source/conf.py | 2 +-
 setup.py                  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/pyGeno/doc/source/conf.py b/pyGeno/doc/source/conf.py
index 3aa2fbe..cb78ae3 100644
--- a/pyGeno/doc/source/conf.py
+++ b/pyGeno/doc/source/conf.py
@@ -64,7 +64,7 @@
 # The short X.Y version.
 version = '2.x'
 # The full version, including alpha/beta/rc tags.
-release = '2.0.x'
+release = '2.1.x'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
diff --git a/setup.py b/setup.py
index 0970c45..96c4a56 100644
--- a/setup.py
+++ b/setup.py
@@ -11,7 +11,7 @@
 setup(
     name='pyGeno',
 
-    version='2.0.1',
+    version='2.1.0',
 
     description='A python package for Personalized Genomics and Proteomics',
     long_description=long_description,