tariqdaouda · feghalya · Sep 21, 2020 · Sep 21, 2020 · Sep 14, 2021 · Sep 14, 2021
diff --git a/pyGeno/Exon.py b/pyGeno/Exon.py
@@ -108,7 +108,7 @@ def _load_bin_sequence(self) :
 		self.bin_UTR5 =  NucBinarySequence(self.UTR5)
 		self.bin_CDS =  NucBinarySequence(self.CDS)
 		self.bin_UTR3 =  NucBinarySequence(self.UTR3)
-		
+
 	def hasCDS(self) :
 		"""returns true or false depending on if the exon has a CDS"""
 		if self.CDS_start != None and self.CDS_end != None:

diff --git a/pyGeno/Protein.py b/pyGeno/Protein.py
@@ -55,10 +55,23 @@ def _makeLoadQuery(self, objectType, *args, **coolArgs) :
 		return pyGenoRabaObjectWrapper._makeLoadQuery(self, objectType, *args, **coolArgs)
 
 	def _load_sequences(self) :
-		if self.chromosome.number != 'MT':
-			self.sequence = uf.translateDNA(self.transcript.cDNA).rstrip('*')
-		else:
+		if self.chromosome.number == 'MT':  # mitochondrial transcripts are translated with the 'mt' table
 			self.sequence = uf.translateDNA(self.transcript.cDNA, translTable_id='mt').rstrip('*')
+		elif self.transcript.selenocysteine is not None:
+			sequence = list(uf.translateDNA(self.transcript.cDNA))  # one residue per codon, patched in place below
+			cds_positions = self.transcript.positions[len(self.transcript.UTR5):]  # loop-invariant: slice once
+			for p in self.transcript.selenocysteine:
+				# NOTE(review): on '-' the annotated coordinate presumably maps to the codon's 3rd base, hence -2 -- confirm.
+				p_seq = cds_positions.index(p) - (2 if self.transcript.gene.strand == '-' else 0)
+				if p_seq % 3:  # NOTE(review): cDNA may start with an 'N' pad that positions lacks -- confirm offsets agree
+					print('WARNING: Selenocysteine position is not multiple of 3 (%s).' % self.transcript.id)
+				p_seq = p_seq // 3
+				if sequence[p_seq] != '*':  # the codon at this position was expected to translate to a stop
+					print('WARNING: Selenocysteine position does not correspond to a stop codon (%s).' % self.transcript.id)
+				sequence[p_seq] = 'U'
+			self.sequence = ''.join(sequence).rstrip('*')
+		else:
+			self.sequence = uf.translateDNA(self.transcript.cDNA).rstrip('*')
 
 
 	def getSequence(self):

diff --git a/pyGeno/Transcript.py b/pyGeno/Transcript.py
@@ -92,46 +92,40 @@ def setV(k, v) :
 		UTR5 = []
 		UTR3 = []
 		exons = []
+		positions = []
 		prime5 = True
 		for ee in self.wrapped_object.exons :
 			e = pyGenoRabaObjectWrapper_metaclass._wrappers[Exon_Raba](wrapped_object_and_bag = (ee, getV('bagKey')))
 			self.exonsDict[(e.start, e.end)] = e
 			exons.append(e)
 			self.data.extend(e.data)
+			if self.gene.strand == '+':
+				positions.extend(range(e.start, e.end))
+			else:
+				positions.extend(range(e.end-1, e.start-1, -1))
 
 			if e.hasCDS() :
 				UTR5.append(''.join(e.UTR5))
 
-				if self.selenocysteine is not None:
-					for position in self.selenocysteine:
-						if e.CDS_start <= position <= e.CDS_end:
-
-							if e.strand == '+':
-								ajusted_position = position - e.CDS_start
-							else:
-								ajusted_position = e.CDS_end - position - 3
-
-							if e.CDS[ajusted_position] == 'T':
-								e.CDS = list(e.CDS)
-								e.CDS[ajusted_position] = '!'			
-
-				if len(cDNA) == 0 and e.frame != 0 :
-					e.CDS = e.CDS[e.frame:]
-
-					if e.strand == '+':
-						e.CDS_start += e.frame
-					else:
-						e.CDS_end -= e.frame
-
+				if len(cDNA) == 0 and e.frame != 0:
+					cDNA.append('N'*(3-e.frame))
+
 				if len(e.CDS):
 					cDNA.append(''.join(e.CDS))
+				else:
+					print('WARNING: hasCDS flag is incorrect for exon %s.' % e.id)
+
 				UTR3.append(''.join(e.UTR3))
 				prime5 = False
 			else :
 				if prime5 :
 					UTR5.append(''.join(e.data))
+					if len(e.UTR3):  # exon without CDS seen before any CDS exon, yet it carries 3'UTR sequence
+						print("WARNING: exon has 3'UTR before transcript starts (%s)." % self.id)
 				else :
 					UTR3.append(''.join(e.data))
+					if len(e.UTR5):  # exon without CDS seen after a CDS exon, yet it carries 5'UTR sequence
+						print("WARNING: exon has 5'UTR after transcript ends (%s)." % self.id)
 
 		sequence = ''.join(self.data)
 		cDNA = ''.join(cDNA)
@@ -142,6 +136,7 @@ def setV(k, v) :
 		setV('cDNA', cDNA)
 		setV('UTR5', UTR5)
 		setV('UTR3', UTR3)
+		setV('positions', positions)
 
 		if len(cDNA) > 0 and len(cDNA) % 3 != 0 :
 			setV('flags', {'DUBIOUS' : True, 'cDNA_LEN_NOT_MULT_3': True})

diff --git a/pyGeno/doc/source/conf.py b/pyGeno/doc/source/conf.py
@@ -64,7 +64,7 @@
 # The short X.Y version.
 version = '2.x'
 # The full version, including alpha/beta/rc tags.
-release = '2.0.x'
+release = '2.1.x'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.

diff --git a/pyGeno/tools/UsefulFunctions.py b/pyGeno/tools/UsefulFunctions.py
@@ -87,9 +87,7 @@ def saveResults(directoryName, fileName, strResults, log = '', args = ''):
 'GTT' : 'V', 'GTC' : 'V', 'GTA' : 'V', 'GTG' : 'V',
 'GCT' : 'A', 'GCC' : 'A', 'GCA' : 'A', 'GCG' : 'A',
 'GAT' : 'D', 'GAC' : 'D', 'GAA' : 'E', 'GAG' : 'E',
-'GGT' : 'G', 'GGC' : 'G', 'GGA' : 'G', 'GGG' : 'G',
-
-'!GA' : 'U'
+'GGT' : 'G', 'GGC' : 'G', 'GGA' : 'G', 'GGG' : 'G'
 
 }
 codonTable = translTable['default']
@@ -207,7 +205,7 @@ def translateDNA_6Frames(sequence) :
 
 	return trans
 
-def translateDNA(sequence, frame = 'f1', translTable_id='default') :
+def translateDNA(sequence, frame = 'f1', translTable_id='default', ambiguous=False) :
 	"""Translates DNA code, frame : fwd1, fwd2, fwd3, rev1, rev2, rev3"""
 
 	protein = ""
@@ -232,20 +230,19 @@ def translateDNA(sequence, frame = 'f1', translTable_id='default') :
 	for i in range(0, len(dna),  3) :
 		codon = dna[i:i+3]
 
-		# Check if variant messed with selenocysteine codon
-		if '!' in codon and codon != '!GA':
-			codon = codon.replace('!', 'T')
-
 		if (len(codon) == 3) :
 			try :
 				# MC
 				protein += translTable[translTable_id][codon]
 			except KeyError :
-				combinaisons = polymorphicCodonCombinaisons(list(codon))
-				translations = set()
-				for ci in range(len(combinaisons)):
-					translations.add(translTable[translTable_id][combinaisons[ci]])
-				protein += '/'.join(translations)
+				if ambiguous:
+					# Collect the distinct translations of the codons returned by polymorphicCodonCombinaisons.
+					combinaisons = polymorphicCodonCombinaisons(list(codon))
+					translations = {translTable[translTable_id][c] for c in combinaisons}
+					# Sorted so the '/'-joined alternatives come out in a reproducible order (set order is arbitrary).
+					protein += '/'.join(sorted(translations))
+				else:
+					protein += 'X'  # codon absent from the translation table: emit a single placeholder residue
 
 	return protein
 

diff --git a/setup.py b/setup.py
@@ -11,7 +11,7 @@
 setup(
     name='pyGeno',
 
-    version='2.0.1',
+    version='2.1.0',
 
     description='A python package for Personalized Genomics and Proteomics',
     long_description=long_description,