diff --git a/pyGeno/Exon.py b/pyGeno/Exon.py index 8fa11ef..0fa4452 100644 --- a/pyGeno/Exon.py +++ b/pyGeno/Exon.py @@ -108,7 +108,7 @@ def _load_bin_sequence(self) : self.bin_UTR5 = NucBinarySequence(self.UTR5) self.bin_CDS = NucBinarySequence(self.CDS) self.bin_UTR3 = NucBinarySequence(self.UTR3) - + def hasCDS(self) : """returns true or false depending on if the exon has a CDS""" if self.CDS_start != None and self.CDS_end != None: diff --git a/pyGeno/Protein.py b/pyGeno/Protein.py index f14fed9..e96d04e 100644 --- a/pyGeno/Protein.py +++ b/pyGeno/Protein.py @@ -55,10 +55,23 @@ def _makeLoadQuery(self, objectType, *args, **coolArgs) : return pyGenoRabaObjectWrapper._makeLoadQuery(self, objectType, *args, **coolArgs) def _load_sequences(self) : - if self.chromosome.number != 'MT': - self.sequence = uf.translateDNA(self.transcript.cDNA).rstrip('*') - else: + if self.chromosome.number == 'MT': self.sequence = uf.translateDNA(self.transcript.cDNA, translTable_id='mt').rstrip('*') + elif self.transcript.selenocysteine is not None: + sequence = list(uf.translateDNA(self.transcript.cDNA)) + for p in self.transcript.selenocysteine: + p_seq = self.transcript.positions[len(self.transcript.UTR5):].index(p) + if self.transcript.gene.strand == '-': + p_seq -= 2 + if p_seq % 3: + print('WARNING: Selenocysteine position is not multiple of 3 (%s).' % self.transcript.id) + p_seq = p_seq // 3 + if sequence[p_seq] != '*': + print('WARNING: Selenocysteine position is not a stop codon (%s).' 
% self.transcript.id) + sequence[p_seq] = 'U' + self.sequence = ''.join(sequence).rstrip('*') + else: + self.sequence = uf.translateDNA(self.transcript.cDNA).rstrip('*') def getSequence(self): diff --git a/pyGeno/Transcript.py b/pyGeno/Transcript.py index 58e3263..cf0a0c5 100644 --- a/pyGeno/Transcript.py +++ b/pyGeno/Transcript.py @@ -92,46 +92,40 @@ def setV(k, v) : UTR5 = [] UTR3 = [] exons = [] + positions = [] prime5 = True for ee in self.wrapped_object.exons : e = pyGenoRabaObjectWrapper_metaclass._wrappers[Exon_Raba](wrapped_object_and_bag = (ee, getV('bagKey'))) self.exonsDict[(e.start, e.end)] = e exons.append(e) self.data.extend(e.data) + if self.gene.strand == '+': + positions.extend(range(e.start, e.end)) + else: + positions.extend(range(e.end-1, e.start-1, -1)) if e.hasCDS() : UTR5.append(''.join(e.UTR5)) - if self.selenocysteine is not None: - for position in self.selenocysteine: - if e.CDS_start <= position <= e.CDS_end: - - if e.strand == '+': - ajusted_position = position - e.CDS_start - else: - ajusted_position = e.CDS_end - position - 3 - - if e.CDS[ajusted_position] == 'T': - e.CDS = list(e.CDS) - e.CDS[ajusted_position] = '!' - - if len(cDNA) == 0 and e.frame != 0 : - e.CDS = e.CDS[e.frame:] - - if e.strand == '+': - e.CDS_start += e.frame - else: - e.CDS_end -= e.frame - + if len(cDNA) == 0 and e.frame != 0: + cDNA.append('N'*(3-e.frame)) + if len(e.CDS): cDNA.append(''.join(e.CDS)) + else: + print('WARNING: hasCDS flag is incorrect for exon %s.' % e.id) + UTR3.append(''.join(e.UTR3)) prime5 = False else : if prime5 : UTR5.append(''.join(e.data)) + if len(e.UTR3): + print("WARNING: exon has 3'UTR before transcript starts (%s)." % self.id) else : UTR3.append(''.join(e.data)) + if len(e.UTR5): + print("WARNING: exon has 5'UTR after transcript ends (%s)." 
% self.id) sequence = ''.join(self.data) cDNA = ''.join(cDNA) @@ -142,6 +136,7 @@ def setV(k, v) : setV('cDNA', cDNA) setV('UTR5', UTR5) setV('UTR3', UTR3) + setV('positions', positions) if len(cDNA) > 0 and len(cDNA) % 3 != 0 : setV('flags', {'DUBIOUS' : True, 'cDNA_LEN_NOT_MULT_3': True}) diff --git a/pyGeno/doc/source/conf.py b/pyGeno/doc/source/conf.py index 3aa2fbe..cb78ae3 100644 --- a/pyGeno/doc/source/conf.py +++ b/pyGeno/doc/source/conf.py @@ -64,7 +64,7 @@ # The short X.Y version. version = '2.x' # The full version, including alpha/beta/rc tags. -release = '2.0.x' +release = '2.1.x' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/pyGeno/tools/UsefulFunctions.py b/pyGeno/tools/UsefulFunctions.py index a614d5e..7b00ee0 100644 --- a/pyGeno/tools/UsefulFunctions.py +++ b/pyGeno/tools/UsefulFunctions.py @@ -87,9 +87,7 @@ def saveResults(directoryName, fileName, strResults, log = '', args = ''): 'GTT' : 'V', 'GTC' : 'V', 'GTA' : 'V', 'GTG' : 'V', 'GCT' : 'A', 'GCC' : 'A', 'GCA' : 'A', 'GCG' : 'A', 'GAT' : 'D', 'GAC' : 'D', 'GAA' : 'E', 'GAG' : 'E', -'GGT' : 'G', 'GGC' : 'G', 'GGA' : 'G', 'GGG' : 'G', - -'!GA' : 'U' +'GGT' : 'G', 'GGC' : 'G', 'GGA' : 'G', 'GGG' : 'G' } codonTable = translTable['default'] @@ -207,7 +205,7 @@ def translateDNA_6Frames(sequence) : return trans -def translateDNA(sequence, frame = 'f1', translTable_id='default') : +def translateDNA(sequence, frame = 'f1', translTable_id='default', ambiguous=False) : """Translates DNA code, frame : fwd1, fwd2, fwd3, rev1, rev2, rev3""" protein = "" @@ -232,20 +230,19 @@ def translateDNA(sequence, frame = 'f1', translTable_id='default') : for i in range(0, len(dna), 3) : codon = dna[i:i+3] - # Check if variant messed with selenocysteine codon - if '!' 
in codon and codon != '!GA': - codon = codon.replace('!', 'T') - if (len(codon) == 3) : try : # MC protein += translTable[translTable_id][codon] except KeyError : - combinaisons = polymorphicCodonCombinaisons(list(codon)) - translations = set() - for ci in range(len(combinaisons)): - translations.add(translTable[translTable_id][combinaisons[ci]]) - protein += '/'.join(translations) + if ambiguous: + combinaisons = polymorphicCodonCombinaisons(list(codon)) + translations = set() + for ci in range(len(combinaisons)): + translations.add(translTable[translTable_id][combinaisons[ci]]) + protein += '/'.join(translations) + else: + protein += 'X' return protein diff --git a/setup.py b/setup.py index 0970c45..96c4a56 100644 --- a/setup.py +++ b/setup.py @@ -11,7 +11,7 @@ setup( name='pyGeno', - version='2.0.1', + version='2.1.0', description='A python package for Personalized Genomics and Proteomics', long_description=long_description,