-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprepare_fasta_RELAX.py
43 lines (31 loc) · 1.75 KB
/
prepare_fasta_RELAX.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
import sys, os
import argparse
import utils
def flip_stop_codon(nt_seq):
''' Given a nucleotide sequence find if it contains a stop codon '''
stop_codons = ['TAA', 'TAG', 'TGA']
codon_positions = list(range(0, len(nt_seq), 3))
codon_seq = [nt_seq[i:i + 3] for i in codon_positions]
return ''.join([codon if codon not in stop_codons else 'NNN' for codon in codon_seq])
if __name__ == '__main__':
## argument parsing business
parser = argparse.ArgumentParser(description = "Convert a CESAR generated multi-fasta to a file that can be used as an input to RELAX. Flips stop codons to NNN and removes insertions that disrupt the reading frame")
parser.add_argument("input", help = 'The input fasta', type = str)
parser.add_argument("output", help = 'The output fasta (RELAX-ready)', type = str)
args = parser.parse_args()
input_file = args.input
output_file = args.output
seqs_dict = utils.fasta_to_dict(input_file)
ref_seq = seqs_dict["hg38"]
ref_seq_gaps = utils.get_indels(ref_seq)
## get only the frame-disrupting indels
ref_seq_gaps_fd = {gap_pos: gap_length for gap_pos, gap_length in ref_seq_gaps.items() if gap_length%3 != 0}
for seq_id, aligned_seq in seqs_dict.items():
for gap_pos, gap_length in ref_seq_gaps_fd.items():
nt_seq_updated = aligned_seq[:gap_pos] + aligned_seq[gap_pos + gap_length:]
seqs_dict[seq_id] = nt_seq_updated
## after removing all inserts that wreck the reading frame, flip any potential stop-codons to NNN and write to the output file
with open(output_file, 'w') as FO:
for seq_id, aligned_seq in seqs_dict.items():
aligned_seq = flip_stop_codon(aligned_seq)
FO.write(f'>{seq_id}\n{aligned_seq}\n')