-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathFormatCDSSnpDivergTable.py
241 lines (210 loc) · 7.63 KB
/
FormatCDSSnpDivergTable.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
# -*- coding: utf-8 -*-
"""
Created on Fri Jun 5 12:26:07 2015
@author: Richard
"""
from manipulate_sequences import *
import os
# open file file for reading
infile = open('cds_codon_table.txt', 'r')
# open file for writing
newfile = open('CDS_SNP_DIVERG.txt', 'w')
# get header
header = infile.readline().rstrip().split()
# insert column with KSR+PX reference allele
header.insert(13, 'KSR+PX_REF')
# add column with KSR+PX alternative allele
header.append('KSR+PX_ALT')
# write header to newfile
for item in header[:-1]:
newfile.write(item + '\t')
newfile.write(header[-1] + '\n')
# loop over file
for line in infile:
line = line.rstrip()
if line != '':
line = line.split()
# compute the number of reference allele in KSR + PX
ref = int(line[10]) + int(line[11])
# compute the number of alternative alleles in KSR + PX
alt = int(line[13]) + int(line[14])
# insert ref in line
line.insert(13, str(ref))
# add alt to line
line.append(str(alt))
# write line to new file
for item in line[:-1]:
newfile.write(item + '\t')
newfile.write(line[-1] + '\n')
# close files
infile.close()
newfile.close()
print('table updated with KSR+PX strains')
# make a list of alignment files
files = os.listdir('../CREM_CLA_protein_divergence/pairs/Aligned_pairs/')
# make a list of alignment files
alignments = [filename for filename in files if filename[-8:] == '_aln.tfa']
# make a dict with crm and cla sequences
orthologs = {}
for filename in alignments:
gene_name = filename[:filename.index('_CLA')]
sequences = {}
orthologs[gene_name] = {}
sequences = convert_fasta('../CREM_CLA_protein_divergence/pairs/Aligned_pairs/' + filename)
for seq in sequences:
if 'CRE' in seq:
orthologs[gene_name]['crm'] = sequences[seq]
else:
orthologs[gene_name]['cla'] = sequences[seq]
print('dict with orthologs done')
# make a dict with the remanei CDS sequences from the snp file
rem_CDS = {}
# open file for reading
infile = open('CDS_SNP_DIVERG.txt', 'r')
# skip header
infile.readline()
# loop over file, populate dict with the crem CDS sequence {gene: CDS}
for line in infile:
line = line.rstrip()
if line != '':
line = line.split()
# get transcript name
gene = line[2]
# build CDS sequence
if gene in rem_CDS:
rem_CDS[gene] += line[5]
else:
rem_CDS[gene] = line[5]
# close file after reading
infile.close()
print('dict with remanei CDS sequences done')
# stop codons were removed in the alignments but are present in the SNP file
# add fake nucleotides at the end of remanei and latens CDS if stop codon is missing
# to bring the sequences to same length
for gene in orthologs:
# check that gene has snps and is in the rem_CDS dict
if gene in rem_CDS:
# check if the remanei sequence in table has a stop codon
if rem_CDS[gene][-3:] in {'TAG', 'TGA', 'TAA'}:
# check if the rem sequences without stop codons differ
if orthologs[gene]['crm'].replace('-', '') != rem_CDS[gene][:-3]:
# if sequences differ raise ValueError
raise ValueError('CDS sequences with stop are different')
# check if the rem sequences have same length
if len(orthologs[gene]['crm'].replace('-', '')) != len(rem_CDS[gene]):
# if different length, add fake stop codon
orthologs[gene]['crm'] += 'XXX'
orthologs[gene]['cla'] += 'XXX'
elif rem_CDS[gene][-3:] not in {'TAG', 'TGA', 'TAA'}:
# check if rem sequences diff
if orthologs[gene]['crm'].replace('-', '') != rem_CDS[gene]:
raise ValueError('CDS sequences without STOP are different')
# no need to do anything to the sequence if stop codon is not there
print('done adding fake stop codons')
# create a dict with remanei genes as key and a dict of codon_position: list of codons as value
# {gene: {i: [crem_codon, cla_codon]}}
codons = {}
for gene in orthologs:
# get the remanei CDS sequence
crm_cds = orthologs[gene]['crm']
# get the latens CDS sequence
cla_cds = orthologs[gene]['cla']
# initiate gap variable
gaps = 0
# initialise j
j = 0
# initiate innder dict
codons[gene] = {}
# loop over the remanei CDS sequence
for i in range(0, len(crm_cds), 3):
if '-' in crm_cds[i:i+3]:
gaps += crm_cds[i:i+3].count('-')
else:
j = i - gaps
codons[gene][j] = [crm_cds[i:i+3], cla_cds[i:i+3]]
print('dict with codon positions done')
# open file for reading
infile = open('CDS_SNP_DIVERG.txt', 'r')
# create string to store the content of newfile
content = ''
# get header
header = infile.readline().rstrip().split()
# add columns to header
header.append('CLA_codon')
header.append('CLA_ref')
header.append('CLA_base_index')
header.append('CLA_codon_index')
header.append('CLA_position_in_codon')
header = '\t'.join(header)
# add header to content
content += (header + '\n')
# create a variable to track the gene ID
gene_ID = ''
# loop over file
for line in infile:
# make a list of line
line = line.rstrip()
if line != '':
line = line.split()
# get gene name
gene = line[2]
# check if gene has an ortholog with aligned sequences
if gene not in orthologs:
# no ortholog and ancestral state, add '???' for codon and '?' for ref
line.append('???')
line.append('?')
# make a string of line
line = '\t'.join(line)
# add line to content
content += (line + '\n')
elif gene in orthologs:
# ortholog exists, get the ancestral state
# make a list of codon positions, sort list
positions = [pos for pos in codons[gene]]
positions.sort()
# check if we are looping in the same gene or if we start with a new gene
if gene != gene_ID:
# initialise counters
i = 0
j = 0
k = 1
# start a new gene
# add first CLA codon
line.append(codons[gene][j][1])
# get the latens sequence gaped positions excluded
cla_seq = ''.join([codons[gene][pos][1] for pos in positions])
# add first nucleotide
line.append(cla_seq[i])
line.append(str(i))
line.append(str(j))
line.append(str(k))
# make string of line
newline = '\t'.join(line)
# add line to content
content += (newline + '\n')
# update i
i += 1
# update gene_ID
gene_ID = gene
elif gene == gene_ID:
# still the same gene
if k != 3:
line.append(codons[gene][j][1])
k += 1
elif k ==3:
k = 1
j += 3
line.append(codons[gene][j][1])
line.append(cla_seq[i])
line.append(str(i))
line.append(str(j))
line.append(str(k))
newline = '\t'.join(line)
content += (newline + '\n')
i += 1
infile.close()
print('done adding ancestral state')
newfile = open('CDS_SNP_DIVERG.txt', 'w')
newfile.write(content)
newfile.close()
print('snp file updated')