-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprotein_divergence.py
294 lines (247 loc) · 10.6 KB
/
protein_divergence.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
import os
from manipulate_sequences import *
# use this function to get the relationships between genes and transcripts
def parent_gene(gff):
'''
(file) -> dict
Return a dictionnary with parent gene as key adn a list of transcripts
as value from the genome annotation GFF file
Precondition: parses the in-house remanei and latens GFF files
'''
# open file for reading
infile = open(gff, 'r')
# create dict to store the genes and transcripts
# {gene: [transcript1, transcript2]}
genes_ts = {}
# loop over file
for line in infile:
line = line.rstrip()
if line != '':
line = line.split()
if len(line) > 8:
# get the gene and transcript names
if line[2] == 'mRNA':
# get transcript name
transcript = line[8][line[8].index('ID=')+3 : line[8].index(';')]
# get gene name
parent = line[8][line[8].index('Parent=')+7: line[8].index(';', line[8].index('Parent'))]
# populate dict
if parent in genes_ts:
genes_ts[parent].append(transcript)
else:
genes_ts[parent] = [transcript]
infile.close()
return genes_ts
# use this function to clean up the file of 1:1 orthologs from transcripts that belong to a same gene
def clean_orthologs_file(best_blast_hits, crem_gff, clat_gff, outputfile):
'''
(file, file, file) -> file
Clean up the file with best blat hits from transcripts that belong to a same
gene. Write to the outputfile the orthologous pairs between remanei and latens
by keeping a single transcript per gene in each species
'''
# create dicts with {gene: [transcript1, transcript2]} in remanei and latens
remanei = parent_gene(crem_gff)
latens = parent_gene(clat_gff)
# create sets of genes for which transcripts are already used in ortholog pairs
rem_already_used = set()
lat_already_used = set()
# reverse dicts with transcript : gene pairs {transcript : gene}
remanei_ts = {}
for gene in remanei:
for ts in remanei[gene]:
remanei_ts[ts] = gene
latens_ts = {}
for gene in latens:
for ts in latens[gene]:
latens_ts[ts] = gene
# create a dict to store the ortholog pairs {crem_TS : clat_TS}
orthologs = {}
# open best blast hits file
infile = open(best_blast_hits, 'r')
# skip header
infile.readline()
# loop over file
for line in infile:
line = line.rstrip()
line = line.split()
clat_TS = line[0]
crem_TS = line[1]
# check that the remanei gene has not been used
if remanei_ts[crem_TS] not in rem_already_used:
# check that the latens gene has not been used
if latens_ts[clat_TS] not in lat_already_used:
# populate ortholog dict
orthologs[crem_TS] = clat_TS
# add corresponding genes to sets
rem_already_used.add(remanei_ts[crem_TS])
lat_already_used.add(latens_ts[clat_TS])
# open file for writing
newfile = open(outputfile, 'w')
# write header
newfile.write('# 1:1 orthologs between remanei and latens obtained by reciprocal BLAST hits\n')
newfile.write('# genes are represented by a single transcript\n')
newfile.write('# ambiguous sites in the reference genome are fixed\n')
newfile.write('Cremanei' + '\t' + 'Clatens' + '\n')
# write content to file
for gene in orthologs:
newfile.write(gene + '\t' + orthologs[gene] + '\n')
# close files
infile.close()
newfile.close()
# use this function to get the pairs of 1:1 orthologs
def orthologous_pairs(ortholog_file):
'''
(file) -> dict
Return a dictionnary with the pairs of orthologs from the ortholog_file
'''
# create a dict with gene names of orthologous pairs
# { species1: species2}
orthologs = {}
# open file for reading
infile = open(ortholog_file, 'r')
# skip comment lines
line = infile.readline()
while line.startswith('#'):
line = infile.readline()
# skip header
if not line.startswith('#'):
header = line
# loop over file, update dict
for line in infile:
line = line.rstrip()
if line !='':
line = line.split()
crem = line[0]
clat = line[1]
orthologs[crem] = clat
infile.close()
return orthologs
# use this function to write separate fasta files contenaing each the sequences of the orthologs
def save_orthologous_seq_pairs_to_file(ortholog_file, crem_CDS_file, cla_CDS_file, destination):
'''
(file, file, file) -> file
Save the CDS sequences of the orthologous pairs in separate fasta files,
by extracting the orthologous relationships from the ortholog file and the
CDS sequences from the CDS fasta files
'''
# convert CDS files into dictionnaries
crem_CDS = convert_fasta(crem_CDS_file)
cla_CDS = convert_fasta(cla_CDS_file)
# create a dict with gene names of orthologous pairs
# { species1: species2}
orthologs = orthologous_pairs(ortholog_file)
# loop over the pairs, grab the corresponding sequence, open file for writing and save sequences
for gene in orthologs:
# fetch the sequences of the cremanei and clatens transcripts
crem_seq = crem_CDS[gene]
cla_seq = cla_CDS[orthologs[gene]]
# open file for writing
outputfile = open(destination + gene + '_' + orthologs[gene] + '.tfa', 'w')
outputfile.write('>' + gene + '\n')
outputfile.write(crem_seq + '\n')
outputfile.write('>' + orthologs[gene] + '\n')
outputfile.write(cla_seq + '\n')
outputfile.close()
# use this function to convert the fasta alignments to a format readbale by codeml
def alignment_file_to_PAML_format(filename, destination):
'''
Open alignment file generated by t-coffee, copy content into a new file
with format compatible with PAML
'''
# check that file is the t-coffee DNA alignment file
if filename[-8:] == '_aln.tfa':
# convert sequences to dictionnary
alignment = convert_fasta(filename)
# get the name of the file without '_aln.tfa'
name = filename[:-8]
# make a list with sequence names
seq_names = [gene for gene in alignment]
# reverse order so that the remanei sequence is in the first
seq_names.reverse()
# open newfile for writing
newfile = open(destination + name + '.txt', 'w')
# add the number of sequences and the length of the alignment in the first line
newfile.write(str(len(alignment)) + ' ' + str(len(alignment[seq_names[0]])) + '\n')
# add orthologous sequences in fasta format
for gene in seq_names[:-1]:
newfile.write('>' + gene + '\n')
newfile.write(alignment[gene] + '\n')
newfile.write('>' + seq_names[-1] + '\n')
newfile.write(alignment[seq_names[-1]] + '\n')
newfile.close()
# use this function to create a codeml control file for a given alignment file
def generate_codeml_control_file(alignment_file, control_file, destination):
'''
(file, file, str) -> file
Take a PAML alignment file and template codeml control file and generate
a new control file, with parameters seqfile and outfile modified to
correspond to the alignment file
'''
# check that alignment_file is a text file
if alignment_file[-4:] == '.txt':
# get the file name
name = alignment_file[:-4]
# open control file for reading
ctlfile = open(control_file, 'r')
# open outputfile file for writing
newfile = open(destination + name + '_codeml.ctl', 'w')
# go through the control file, copy to new file with new input file and output file
for line in ctlfile:
if 'seqfile' not in line and 'outfile' not in line:
newfile.write(line)
elif 'seqfile' in line:
line = line.rstrip().split()
# replace the file name with the alignment file name
line[2] = alignment_file
for item in line[:-1]:
newfile.write(item + ' ')
newfile.write(line[-1] + '\n')
elif 'outfile' in line:
line = line.rstrip().split()
# replace the outputfile name with alignment_file.out
line[2] = name + '.out.txt'
for item in line[:-1]:
newfile.write(item + ' ')
newfile.write(line[-1] + '\n')
ctlfile.close()
newfile.close()
# use this function to parse the codeml output files and combine results into a single table
def save_divergence_results_to_file(crem_gff, cla_gff, outputfile, source):
'''
(file, file, file) -> file
Extract the dN and dS values from the codml output files,
and save a table to outputfile with columns including the gene names
and transcript names of each sequence using the GFF annotation files
'''
# make a list of files in current directory
files = os.listdir(source)
# open outfile for writing
newfile = open(source + outputfile, 'w')
newfile.write('Cremanei_transcript' + '\t' + 'Cremanei_gene' + '\t' + 'Clatens_transcript' + '\t' + 'Clatens_gene' + '\t' + 'dN' + '\t' + 'dS' + '\t' + 'dN/dS' + '\n')
# get latens transcript : gene pairs
cla_transcripts = transcript_to_gene(cla_gff)
# get remanei transcript : gene pairs
crem_transcripts = transcript_to_gene(crem_gff)
# loop over the files
for filename in files:
if '.out.txt' in filename:
infile = open(filename, 'r')
crem_TS = filename[:filename.index('_CLA')]
cla_TS = filename[filename.index('CLA'): filename.index('.out')]
for line in infile:
if 'tree length for dN' in line:
line = line.split()
dN = float(line[-1])
elif 'tree length for dS' in line:
line = line.split()
dS = float(line[-1])
crem_gene = crem_transcripts[crem_TS]
cla_gene = cla_transcripts[cla_TS]
if dS != 0:
omega = dN / dS
elif dS == 0:
omega = 'NA'
newfile.write(crem_TS + '\t' + crem_gene + '\t' + cla_TS + '\t' + cla_gene + '\t' + str(dN) + '\t' + str(dS) + '\t' + str(omega) + '\n')
infile.close()
newfile.close()