-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmiRNACoordinatesCuration.py
335 lines (304 loc) · 11 KB
/
miRNACoordinatesCuration.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
# -*- coding: utf-8 -*-
"""
Created on Sat May 7 22:16:13 2016
@author: Richard
"""
from manipulate_sequences import *
from divergence import *
import os
import sys
# use this script to find the coordinates of the remanei mirnas in PX356 assembly
# options:
# [found/search]: add final coordinates if sequences are found, or edit sequences if search
step = sys.argv[1]
# add family conservation to mirnas
# get the seeds of distant caeno species
caenoSeeds = set()
infile = open('CBR_seeds.txt')
for line in infile:
line = line.rstrip()
if line != '':
caenoSeeds.add(line)
infile.close()
infile = open('CEL_seeds.txt')
for line in infile:
line = line.rstrip()
if line != '':
caenoSeeds.add(line)
infile.close()
infile = open('CBN_seeds.txt')
for line in infile:
line = line.rstrip()
if line != '':
caenoSeeds.add(line)
infile.close()
# get seeds in latens
latensSeeds = set()
infile = open('CLA_seeds.txt')
for line in infile:
line = line.rstrip()
if line != '':
latensSeeds.add(line)
infile.close()
if step == 'search':
# create a dict to store information about mirna {name :[list of info]}
mirnas = {}
infile = open('CRM_miRNAs_Coords_rj.txt')
header = infile.readline().rstrip().split('\t')
for line in infile:
if line.rstrip() != '':
line = line.rstrip().split('\t')
mirnas[line[0]] = line[1:]
infile.close()
# make sure strings are upper caps
for i in mirnas:
mirnas[i][0] = mirnas[i][0].upper()
mirnas[i][3] = mirnas[i][3].upper()
mirnas[i][4] = mirnas[i][4].upper()
# check family conservation
for mir in mirnas:
# get the seed sequence
seed = mirnas[mir][4]
if seed in caenoSeeds:
conservation = 'Caeno'
if seed not in caenoSeeds and seed in latensSeeds:
conservation = 'CrmCla'
if seed not in caenoSeeds and seed not in latensSeeds:
conservation = 'Crm'
# add conservation as varaible to list
mirnas[mir].append(conservation)
# add fields to header
header.append('family_conservation')
header.append('strand')
# create a list of mirna names
names = [i for i in mirnas]
names.sort()
# convert genome sequence to dict
genome = convert_fasta('../Genome_Files/noamb_356_v1_4.txt')
# create a dict with hairpin and sequences (+ and rev complement) extracted from genome
# {name :[hairpin, seq, reverse_complement]}
sequences = {}
for mir in mirnas:
# get hairpin
hairpin = mirnas[mir][0]
# get chromo
chromo = mirnas[mir][1]
# get start and end positions 0-based
start = int(mirnas[mir][2]) - 1
end = start + len(hairpin)
# extract forward sequence
seq = genome[chromo][start: end]
# extract reverse complement
revseq = reverse_complement(seq)
sequences[mir] = [hairpin, seq, revseq]
# create a dict to store differences between hairpin and extracted sequences
# {name :[diff_seq, diff_revseq]}
differences = {}
for i in mirnas:
diff1 = match_diff(sequences[i][0], sequences[i][1])
diff2 = match_diff(sequences[i][0], sequences[i][2])
differences[i] = [diff1, diff2]
# create lists of mirna names for divergent and related sequences
divergent, close, nodiff = [], [], []
try:
os.listdir('files_to_edit')
except:
os.mkdir('files_to_edit')
# loop over differences
for mir in differences:
# check if differences are large or low
# get mirna orientation
if differences[mir][0] <= 3:
strand = '+'
assert differences[mir][1] > 3, 'differences are not strand biased'
if differences[mir][0] > 0:
close.append(mir)
elif differences[mir][0] == 0:
nodiff.append(mir)
elif differences[mir][1] <= 3:
strand = '-'
assert differences[mir][0] > 3, 'differences are not strand biased'
if differences[mir][1] > 0:
close.append(mir)
elif differences[mir][1] == 0:
nodiff.append(mir)
elif differences[mir][0] > 3 and differences[mir][1] > 3:
strand = 'NA'
divergent.append(mir)
# add orientation to list
mirnas[mir].append(strand)
# get mature miR
mature = mirnas[mir][3]
# check if mirna needs manual curation
curation = False
if strand in '+-':
if strand == '+' and differences[mir][0] > 0:
# check if mature has differences
if mature not in sequences[mir][1]:
curation = True
else:
# no need for manual curation but replace hairpin with seq
mirnas[mir][0] = sequences[mir][1]
elif strand == '-' and differences[mir][1] > 0:
# check if mature has differences
if mature not in sequences[mir][2]:
curation = True
else:
# no need for manual curation but replace hairpin with revseq
mirnas[mir][0] = sequences[mir][2]
elif strand == 'NA':
curation = True
# check if manual curation is needed
if curation == True:
# add mature to file and do manual curation
newfile = open('files_to_edit/' + mir + '.fasta', 'w')
newfile.write('>crem\n')
newfile.write(sequences[mir][0] + '\n')
newfile.write('>seq\n')
newfile.write(sequences[mir][1] + '\n')
newfile.write('>revseq\n')
newfile.write(sequences[mir][2] + '\n')
newfile.write('>mature\n')
newfile.write(mature + '\n')
newfile.close()
# check that all mirnas are dealt with
assert len(mirnas) == len(nodiff) + len(close) + len(divergent), 'mirnas are missing'
# save information to file
newfile = open('CRM_miRNAsCoordinatesEdits.txt', 'w')
newfile.write('\t'.join(header) + '\n')
total = 0
for mir in nodiff:
newfile.write(mir + '\t' + '\t'.join(mirnas[mir]) + '\n')
total += 1
for mir in close:
newfile.write(mir + '\t' + '\t'.join(mirnas[mir]) + '\n')
total += 1
for mir in divergent:
newfile.write(mir + '\t' + '\t'.join(mirnas[mir]) + '\n')
total += 1
newfile.close()
# check that all mirnas are saved to file
assert len(mirnas) == total, 'some mirnas are not saved to file'
elif step == 'found':
# remove mirnas for which the mature miR doesn't match the hairpin sequence
to_remove = []
# convert file to dict, check that all mature seqs are in hairpin
infile = open('CRM_miRNAsCoordinatesEdited.txt')
header = infile.readline().rstrip().split('\t')
mirnas = {}
for line in infile:
line = line.rstrip()
if line != '':
line = line.split('\t')
if line[4] not in line[1]:
# check that mature in reverse complement of hairpin
if line[4] in reverse_complement(line[1]):
# change hairpin to its reverse complement
line[1] = reverse_complement(line[1])
else:
to_remove.append(line[0])
mirnas[line[0]] = line
infile.close()
# remove mirnas
for i in to_remove:
del mirnas[i]
# convert 6bp seed to 7bp seed
for mir in mirnas:
mirnas[mir][5] = mirnas[mir][4][1:8]
newfile = open('CRM_miRNAsCoordinatesFinal.txt', 'w')
newfile.write('\t'.join(header) + '\n')
for i in mirnas:
newfile.write('\t'.join(mirnas[i]) + '\n')
newfile.close()
# do some QC
infile = open('CRM_miRNAsCoordinatesFinal.txt')
infile.readline()
for line in infile:
line = line.rstrip()
if line != '':
line = line.split('\t')
# check that mature in hairpin
assert line[4] in line[1], 'mature not in hairpin'
# check that seed == 7bp
assert len(line[5]) == 7, 'seed is not 6bp'
# check that seed is seed
assert line[4][1:8] == line[5], 'seed does not match'
infile.close()
# add coordinates to each mirnas
mirnas = {}
infile = open('CRM_miRNAsCoordinatesFinal.txt')
header = infile.readline().rstrip().split('\t')
for line in infile:
line = line.rstrip()
if line != '':
line = line.split('\t')
mir = line[0]
mirnas[mir] = line
infile.close()
# modify header
# remove strand
header.remove(header[-1])
# remove start position
header.remove(header[3])
# correct typo
header[1] = 'Hairpin'
coord = ['start', 'end', 'orientation']
# extract chromo and add at first position in coord list
chromo = header.pop(2)
# add coordinates after chromo
coord.insert(0, chromo)
# add coordinates after mirna name
j = 1
for i in range(len(coord)):
header.insert(j+i, coord[i])
# convert genome sequence to dict
genome = convert_fasta('../Genome_Files/noamb_356_v1_4.txt')
# loop over mirnas, modifiy conservation and coordinates
for mir in mirnas:
# remove strand
mirnas[mir].remove(mirnas[mir][-1])
# remove conservation
mirnas[mir].remove(mirnas[mir][-1])
# remove start position
mirnas[mir].remove(mirnas[mir][3])
# get hairpin
hairpin = mirnas[mir][1]
# extract chromo
chromo = mirnas[mir].pop(2)
# find coordinates
if hairpin in genome[chromo]:
start = genome[chromo].index(hairpin)
end = start + len(hairpin)
orientation = '+'
elif reverse_complement(hairpin) in genome[chromo]:
start = genome[chromo].index(reverse_complement(hairpin))
end = start + len(hairpin)
orientation = '-'
else:
print(mir)
# create a list of coordinates, change coordinates to 1-based
coord = [chromo, str(start + 1), str(end), orientation]
# add coordinates after mirna name
j = 1
for i in range(len(coord)):
mirnas[mir].insert(j+i, coord[i])
# check family conservation
# get the seed sequence
seed = mirnas[mir][7]
if seed in caenoSeeds:
conservation = 'Caeno'
elif seed not in caenoSeeds and seed in latensSeeds:
conservation = 'CrmCla'
elif seed not in caenoSeeds and seed not in latensSeeds:
conservation = 'Crm'
# add conservation as varaible to list
mirnas[mir].append(conservation)
# save data to file
# create a list of mirna names
names = [i for i in mirnas]
names.sort()
newfile = open('CRM_miRNAsCoordinatesFinal.txt', 'w')
newfile.write('\t'.join(header) + '\n')
for i in names:
newfile.write('\t'.join(mirnas[i]) + '\n')
newfile.close()