-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfastonex.py
95 lines (81 loc) · 3.33 KB
/
fastonex.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
#!/usr/bin/env python3
'''
FASTA to Leave NEXUS Converter - Convert a FASTA file into Leave NEXUS
Copyright (C) 2024 fonors, goncalof21, MadalenaFranco2 & scmdcunha
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
'''
import argparse
from sys import stderr, exit
# Module-level command-line parser; the options themselves are registered
# later by user_args(), which receives this object as its argument.
parser = argparse.ArgumentParser(
    prog="FASTA to NEXUS Converter",
    description="Convert a FASTA formatted file into the NEXUS format.",
)
def user_args(argparser):
    """
    Register the script's command-line options and parse the command line.

    Parameters:
        argparser: the argparse.ArgumentParser the options are added to.

    Returns:
        The namespace returned by ``argparser.parse_args()``; the path given
        with ``-i``/``--input`` is available as its ``input`` attribute.

    Exits:
        argparse prints a usage message and exits with status 2 when the
        required ``-i``/``--input`` option is missing.
    """
    # The input file is mandatory: without it the value would be None and the
    # later open() call would fail with an unhelpful TypeError.
    argparser.add_argument(
        "-i", "--input", required=True, help="Input FASTA file."
    )
    return argparser.parse_args()
def fasta_todict(file):
    """
    Read a FASTA formatted file and return its sequences as a dictionary.

    Parameters:
        file: path of the FASTA file to read.

    Returns:
        A dict mapping each header (the text following ">") to its sequence.
        Sequences spanning several lines are joined and converted to lower
        case. If the same header occurs more than once, the last record
        replaces the earlier one.

    Raises:
        ValueError: if sequence text appears before the first ">" header.
    """
    chunks_by_name = {}
    name = None
    with open(file, "r") as handle:
        for line in handle:
            line = line.strip()
            if line.startswith(">"):
                name = line[1:]
                # A repeated header starts over, so the last record wins.
                chunks_by_name[name] = []
            elif not line:
                # Blank lines carry no sequence data.
                continue
            elif name is None:
                raise ValueError(
                    "Sequence data found before the first FASTA header (\">\")."
                )
            else:
                chunks_by_name[name].append(line.lower())
    # Join once per record instead of growing a string line by line.
    return {
        header: "".join(chunks) for header, chunks in chunks_by_name.items()
    }
def nexus_data(seqdict):
    """
    Build the NEXUS DATA header for a dictionary of sequences.

    Parameters:
        seqdict: dict mapping sequence names to sequence strings.

    Returns:
        The header text: "#NEXUS", "BEGIN DATA;", a DIMENSIONS line with
        NTAX set to the number of sequences and NCHAR set to the length of
        the longest sequence, and a FORMAT line whose DATATYPE is RNA or DNA.

    Exits:
        Prints a message to stderr and exits with status 1 when seqdict is
        empty or when no sequence contains a "t" or a "u".
    """
    if not seqdict:
        print("No sequences were provided.", file=stderr)
        exit(1)

    # The first sequence containing "u" (RNA) or "t" (DNA) decides the data
    # type; sequences containing neither letter are skipped, not rejected.
    datatype = None
    for sequence in seqdict.values():
        residues = sequence.lower()
        if "u" in residues:
            datatype = "FORMAT DATATYPE=RNA MISSING=N GAP=-;\n"
            break
        if "t" in residues:
            datatype = "FORMAT DATATYPE=DNA MISSING=N GAP=-;\n"
            break

    if datatype is None:
        print("The sequences provided aren't valid DNA or RNA sequences. Make sure your sequences only have the characters \"ATGC\" or \"AUGC\".", file=stderr)
        # Non-zero status so callers and shells can detect the failure.
        exit(1)

    maxseqlen = max(len(sequence) for sequence in seqdict.values())
    nexus_header = f"#NEXUS\n\nBEGIN DATA;\nDIMENSIONS NTAX={len(seqdict)} NCHAR={maxseqlen};\n" + datatype
    return nexus_header
def nexus_matrix(seqdict):
    """
    Build the NEXUS MATRIX block for a dictionary of sequences.

    Parameters:
        seqdict: dict mapping sequence names to sequence strings.

    Returns:
        The text "MATRIX", one "<name> <sequence>" line per entry, and the
        closing ";" and "END;" lines. Sequences shorter than the longest one
        are right-padded with "-" so every row has the same length. An empty
        dict produces a block without rows.
    """
    # default=0 keeps an empty dictionary from raising inside max().
    maxseqlen = max((len(sequence) for sequence in seqdict.values()), default=0)
    # NOTE(review): names are written verbatim; names containing whitespace
    # may need quoting for NEXUS readers - confirm against the consumers.
    rows = [
        f"{name} {sequence.ljust(maxseqlen, '-')}\n"
        for name, sequence in seqdict.items()
    ]
    return "MATRIX\n\n" + "".join(rows) + ";\n\nEND;"
if __name__ == "__main__":
    # Script entry point: parse the command line, load the FASTA file and
    # print the NEXUS DATA header followed by the MATRIX block to stdout.
    # The parsed arguments get their own name so that the user_args()
    # function is not overwritten by its own result.
    args = user_args(parser)
    seq_dict = fasta_todict(args.input)
    print(nexus_data(seq_dict) + nexus_matrix(seq_dict))