From 818250e5ea790e89e4f555aa373b56c2c4289ae3 Mon Sep 17 00:00:00 2001 From: Albert Feghaly Date: Mon, 21 Sep 2020 12:22:01 -0400 Subject: [PATCH 1/6] Temporarily patch T->! bug in binary sequences --- pyGeno/Exon.py | 14 ++++++++++++++ pyGeno/Transcript.py | 17 +++-------------- 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/pyGeno/Exon.py b/pyGeno/Exon.py index 8fa11ef..19c9617 100644 --- a/pyGeno/Exon.py +++ b/pyGeno/Exon.py @@ -109,6 +109,20 @@ def _load_bin_sequence(self) : self.bin_CDS = NucBinarySequence(self.CDS) self.bin_UTR3 = NucBinarySequence(self.UTR3) + def _patch_seleno(self, e, selenocysteine): + if selenocysteine is not None: + for position in selenocysteine: + if e.CDS_start <= position <= e.CDS_end: + + if e.strand == '+': + ajusted_position = position - e.CDS_start + else: + ajusted_position = e.CDS_end - position - 3 + + if e.CDS[ajusted_position] == 'T': + e.CDS = list(e.CDS) + e.CDS[ajusted_position] = '!' + def hasCDS(self) : """returns true or false depending on if the exon has a CDS""" if self.CDS_start != None and self.CDS_end != None: diff --git a/pyGeno/Transcript.py b/pyGeno/Transcript.py index 58e3263..adb7851 100644 --- a/pyGeno/Transcript.py +++ b/pyGeno/Transcript.py @@ -102,19 +102,8 @@ def setV(k, v) : if e.hasCDS() : UTR5.append(''.join(e.UTR5)) - if self.selenocysteine is not None: - for position in self.selenocysteine: - if e.CDS_start <= position <= e.CDS_end: - - if e.strand == '+': - ajusted_position = position - e.CDS_start - else: - ajusted_position = e.CDS_end - position - 3 - - if e.CDS[ajusted_position] == 'T': - e.CDS = list(e.CDS) - e.CDS[ajusted_position] = '!' 
- + e._patch_seleno(e, self.selenocysteine) + if len(cDNA) == 0 and e.frame != 0 : e.CDS = e.CDS[e.frame:] @@ -151,7 +140,7 @@ def setV(k, v) : def _load_bin_sequence(self) : self.bin_sequence = NucBinarySequence(self.sequence) self.bin_UTR5 = NucBinarySequence(self.UTR5) - self.bin_cDNA = NucBinarySequence(self.cDNA) + self.bin_cDNA = NucBinarySequence(self.cDNA.replace('!', 'T' )) self.bin_UTR3 = NucBinarySequence(self.UTR3) def getNucleotideCodon(self, cdnaX1) : From 017c5f282716aea853158636b975dbd4f46bb762 Mon Sep 17 00:00:00 2001 From: Albert Feghaly Date: Mon, 21 Sep 2020 12:25:54 -0400 Subject: [PATCH 2/6] Indentation fix --- pyGeno/Transcript.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyGeno/Transcript.py b/pyGeno/Transcript.py index adb7851..dc1d4ea 100644 --- a/pyGeno/Transcript.py +++ b/pyGeno/Transcript.py @@ -102,7 +102,7 @@ def setV(k, v) : if e.hasCDS() : UTR5.append(''.join(e.UTR5)) - e._patch_seleno(e, self.selenocysteine) + e._patch_seleno(e, self.selenocysteine) if len(cDNA) == 0 and e.frame != 0 : e.CDS = e.CDS[e.frame:] From 81b0973d468edabd0dc451a84732329a0bcfd84d Mon Sep 17 00:00:00 2001 From: Albert Feghaly Date: Mon, 13 Sep 2021 20:28:19 -0400 Subject: [PATCH 3/6] Do not return all amino acid combinations by default --- pyGeno/tools/UsefulFunctions.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/pyGeno/tools/UsefulFunctions.py b/pyGeno/tools/UsefulFunctions.py index a614d5e..cb44ba3 100644 --- a/pyGeno/tools/UsefulFunctions.py +++ b/pyGeno/tools/UsefulFunctions.py @@ -207,7 +207,7 @@ def translateDNA_6Frames(sequence) : return trans -def translateDNA(sequence, frame = 'f1', translTable_id='default') : +def translateDNA(sequence, frame = 'f1', translTable_id='default', ambiguous=False) : """Translates DNA code, frame : fwd1, fwd2, fwd3, rev1, rev2, rev3""" protein = "" @@ -232,20 +232,19 @@ def translateDNA(sequence, frame = 'f1', translTable_id='default') : for i in 
range(0, len(dna), 3) : codon = dna[i:i+3] - # Check if variant messed with selenocysteine codon - if '!' in codon and codon != '!GA': - codon = codon.replace('!', 'T') - if (len(codon) == 3) : try : # MC protein += translTable[translTable_id][codon] except KeyError : - combinaisons = polymorphicCodonCombinaisons(list(codon)) - translations = set() - for ci in range(len(combinaisons)): - translations.add(translTable[translTable_id][combinaisons[ci]]) - protein += '/'.join(translations) + if ambiguous: + combinaisons = polymorphicCodonCombinaisons(list(codon)) + translations = set() + for ci in range(len(combinaisons)): + translations.add(translTable[translTable_id][combinaisons[ci]]) + protein += '/'.join(translations) + else: + protein += 'X' return protein From 3155744aea600a11bcca90230101f387dfc3d70f Mon Sep 17 00:00:00 2001 From: Albert Feghaly Date: Mon, 13 Sep 2021 20:32:33 -0400 Subject: [PATCH 4/6] Undo editing of exon objects in Transcript class --- pyGeno/Exon.py | 14 -------------- pyGeno/Transcript.py | 18 +++++++----------- pyGeno/tools/UsefulFunctions.py | 4 +--- 3 files changed, 8 insertions(+), 28 deletions(-) diff --git a/pyGeno/Exon.py b/pyGeno/Exon.py index 19c9617..0fa4452 100644 --- a/pyGeno/Exon.py +++ b/pyGeno/Exon.py @@ -108,20 +108,6 @@ def _load_bin_sequence(self) : self.bin_UTR5 = NucBinarySequence(self.UTR5) self.bin_CDS = NucBinarySequence(self.CDS) self.bin_UTR3 = NucBinarySequence(self.UTR3) - - def _patch_seleno(self, e, selenocysteine): - if selenocysteine is not None: - for position in selenocysteine: - if e.CDS_start <= position <= e.CDS_end: - - if e.strand == '+': - ajusted_position = position - e.CDS_start - else: - ajusted_position = e.CDS_end - position - 3 - - if e.CDS[ajusted_position] == 'T': - e.CDS = list(e.CDS) - e.CDS[ajusted_position] = '!' 
def hasCDS(self) : """returns true or false depending on if the exon has a CDS""" diff --git a/pyGeno/Transcript.py b/pyGeno/Transcript.py index dc1d4ea..4b17461 100644 --- a/pyGeno/Transcript.py +++ b/pyGeno/Transcript.py @@ -102,18 +102,14 @@ def setV(k, v) : if e.hasCDS() : UTR5.append(''.join(e.UTR5)) - e._patch_seleno(e, self.selenocysteine) - - if len(cDNA) == 0 and e.frame != 0 : - e.CDS = e.CDS[e.frame:] - - if e.strand == '+': - e.CDS_start += e.frame - else: - e.CDS_end -= e.frame - + if len(cDNA) == 0 and e.frame != 0: + cDNA.append('N'*(3-e.frame)) + if len(e.CDS): cDNA.append(''.join(e.CDS)) + else: + print('WARNING: hasCDS flag is incorrect for exon %s.' % e.id) + UTR3.append(''.join(e.UTR3)) prime5 = False else : @@ -140,7 +136,7 @@ def setV(k, v) : def _load_bin_sequence(self) : self.bin_sequence = NucBinarySequence(self.sequence) self.bin_UTR5 = NucBinarySequence(self.UTR5) - self.bin_cDNA = NucBinarySequence(self.cDNA.replace('!', 'T' )) + self.bin_cDNA = NucBinarySequence(self.cDNA) self.bin_UTR3 = NucBinarySequence(self.UTR3) def getNucleotideCodon(self, cdnaX1) : diff --git a/pyGeno/tools/UsefulFunctions.py b/pyGeno/tools/UsefulFunctions.py index cb44ba3..7b00ee0 100644 --- a/pyGeno/tools/UsefulFunctions.py +++ b/pyGeno/tools/UsefulFunctions.py @@ -87,9 +87,7 @@ def saveResults(directoryName, fileName, strResults, log = '', args = ''): 'GTT' : 'V', 'GTC' : 'V', 'GTA' : 'V', 'GTG' : 'V', 'GCT' : 'A', 'GCC' : 'A', 'GCA' : 'A', 'GCG' : 'A', 'GAT' : 'D', 'GAC' : 'D', 'GAA' : 'E', 'GAG' : 'E', -'GGT' : 'G', 'GGC' : 'G', 'GGA' : 'G', 'GGG' : 'G', - -'!GA' : 'U' +'GGT' : 'G', 'GGC' : 'G', 'GGA' : 'G', 'GGG' : 'G' } codonTable = translTable['default'] From f9c2b6ae1697cc3c5a001e15c1fab47f534f84ff Mon Sep 17 00:00:00 2001 From: Albert Feghaly Date: Mon, 13 Sep 2021 21:17:36 -0400 Subject: [PATCH 5/6] Selenocysteine problem fixed --- pyGeno/Protein.py | 19 ++++++++++++++++--- pyGeno/Transcript.py | 10 ++++++++++ 2 files changed, 26 insertions(+), 3 
deletions(-) diff --git a/pyGeno/Protein.py b/pyGeno/Protein.py index f14fed9..e96d04e 100644 --- a/pyGeno/Protein.py +++ b/pyGeno/Protein.py @@ -55,10 +55,23 @@ def _makeLoadQuery(self, objectType, *args, **coolArgs) : return pyGenoRabaObjectWrapper._makeLoadQuery(self, objectType, *args, **coolArgs) def _load_sequences(self) : - if self.chromosome.number != 'MT': - self.sequence = uf.translateDNA(self.transcript.cDNA).rstrip('*') - else: + if self.chromosome.number == 'MT': self.sequence = uf.translateDNA(self.transcript.cDNA, translTable_id='mt').rstrip('*') + elif self.transcript.selenocysteine is not None: + sequence = list(uf.translateDNA(self.transcript.cDNA)) + for p in self.transcript.selenocysteine: + p_seq = self.transcript.positions[len(self.transcript.UTR5):].index(p) + if self.transcript.gene.strand == '-': + p_seq -= 2 + if p_seq % 3: + print('WARNING: Selenocysteine position is not multiple of 3 (%s).' % self.transcript.id) + p_seq = p_seq // 3 + if sequence[p_seq] != '*': + print('WARNING: Selenocysteine position is not a stop codon (%s).' 
% self.transcript.id) + sequence[p_seq] = 'U' + self.sequence = ''.join(sequence).rstrip('*') + else: + self.sequence = uf.translateDNA(self.transcript.cDNA).rstrip('*') def getSequence(self): diff --git a/pyGeno/Transcript.py b/pyGeno/Transcript.py index 4b17461..cf0a0c5 100644 --- a/pyGeno/Transcript.py +++ b/pyGeno/Transcript.py @@ -92,12 +92,17 @@ def setV(k, v) : UTR5 = [] UTR3 = [] exons = [] + positions = [] prime5 = True for ee in self.wrapped_object.exons : e = pyGenoRabaObjectWrapper_metaclass._wrappers[Exon_Raba](wrapped_object_and_bag = (ee, getV('bagKey'))) self.exonsDict[(e.start, e.end)] = e exons.append(e) self.data.extend(e.data) + if self.gene.strand == '+': + positions.extend(range(e.start, e.end)) + else: + positions.extend(range(e.end-1, e.start-1, -1)) if e.hasCDS() : UTR5.append(''.join(e.UTR5)) @@ -115,8 +120,12 @@ def setV(k, v) : else : if prime5 : UTR5.append(''.join(e.data)) + if len(e.UTR3): + print("WARNING: exon has 3'UTR before transcript starts (%s)." % self.id) else : UTR3.append(''.join(e.data)) + if len(e.UTR5): + print("WARNING: exon has 5'UTR after transcript ends (%s)." % self.id) sequence = ''.join(self.data) cDNA = ''.join(cDNA) @@ -127,6 +136,7 @@ def setV(k, v) : setV('cDNA', cDNA) setV('UTR5', UTR5) setV('UTR3', UTR3) + setV('positions', positions) if len(cDNA) > 0 and len(cDNA) % 3 != 0 : setV('flags', {'DUBIOUS' : True, 'cDNA_LEN_NOT_MULT_3': True}) From 77fd4dd4313461b8d3ebd81d6031bd7a563129bc Mon Sep 17 00:00:00 2001 From: Albert Feghaly Date: Wed, 15 Sep 2021 23:19:52 -0400 Subject: [PATCH 6/6] Bump version to 2.1.0 --- pyGeno/doc/source/conf.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyGeno/doc/source/conf.py b/pyGeno/doc/source/conf.py index 3aa2fbe..cb78ae3 100644 --- a/pyGeno/doc/source/conf.py +++ b/pyGeno/doc/source/conf.py @@ -64,7 +64,7 @@ # The short X.Y version. version = '2.x' # The full version, including alpha/beta/rc tags. 
-release = '2.0.x' +release = '2.1.x' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/setup.py b/setup.py index 0970c45..96c4a56 100644 --- a/setup.py +++ b/setup.py @@ -11,7 +11,7 @@ setup( name='pyGeno', - version='2.0.1', + version='2.1.0', description='A python package for Personalized Genomics and Proteomics', long_description=long_description,