-
Notifications
You must be signed in to change notification settings - Fork 1
/
run.py
169 lines (144 loc) · 7.79 KB
/
run.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
#!/usr/bin/env python3
import argparse
import os
from progress.bar import Bar
import shutil
import subprocess
import sys
import traceback
from include.misc import fasta, gff3, string
from include.prrsv_orf5_rflp import prrsv_orf5_rflp
from include.sequence_annotate import sequence_annotate
from include.settings import bash_path, minimal_stderr, is_circular, is_double
from include.species_identify import species_identify
################################################################################
spar_path = os.path.dirname(os.path.realpath(__file__))
valid_organism_li = [d for d in os.listdir(spar_path+"/required/") if os.path.isdir(spar_path+"/required/"+d) and d != "blastdb"]
if bash_path != None:
os.environ["PATH"] = bash_path
################################################################################
################################################################################
def main():
'''Main method : parse input arguments and run appropriate operation'''
def str2bool(v):
out = string.convert_bool(v)
if out == None:
raise argparse.ArgumentTypeError('Boolean value expected.')
return(out)
parser = argparse.ArgumentParser()
subparsers = parser.add_subparsers(dest="process", required=True, help="Type of input (rflp or annotate). See specific help for more options based on input type chosen e.g. python run.py rflp --help")
# annotate
sp_annotate = subparsers.add_parser("annotate", help="") #!
sp_annotate.add_argument("input", type=str, help="Input FASTA file name")
sp_annotate.add_argument("--limit", type=str, help="Annotates only genomic features provided in comma delimited list")
sp_annotate.add_argument("--organism", type=str, help="Possible organism identity of sequences in input FASTA file provided in comma delimited list")
sp_annotate.add_argument("--output", type=str, help="Output gff3 file name")
# rflp
sp_rflp = subparsers.add_parser("rflp", help="") #!
sp_rflp.add_argument("input", type=str, help="Input FASTA file name")
sp_rflp.add_argument("--full", type=str2bool, default=True, help="Only assign RFLP pattern to input sequences that are complete")
sp_rflp.add_argument("--output", type=str, help="Output FASTA file name")
args = parser.parse_args()
# logging.info("Input Arguments : %s", args) #?
read_fasta_li2 = fasta.read(args.input)
# build random, non-existent directory name
temp_dir = string.unique_directory(spar_path+"/temporary/")
subprocess.call("mkdir -p "+temp_dir, shell=True)
# variable assignment based on invoked subparser option
if args.process == "annotate":
input_organism_li, choose_match_li = [], []
if args.organism != None:
input_organism_li = args.organism.split(",")
if args.limit != None:
choose_match_li = args.limit.split(",")
if args.process == "rflp":
input_organism_li, choose_match_li = ["PRRSV1", "PRRSV2"], ["ORF5"]
# Run
# processes input fasta based on provided list of possible organism/s and feature/s
annotate_li4 = [[] for x in read_fasta_li2[0]]
input_organism_li = [x for x in input_organism_li if x in valid_organism_li]
if len(input_organism_li) == 0:
input_organism_li = valid_organism_li[:]
# check required bash utilities
required_program_li = ["hmmalign", "hmmbuild", "mafft", "awk"]
if "prrsv_orf5_RFLP" in sys.argv[0] or len(input_organism_li) != 1:
required_program_li += ["blastn", "makeblastdb", "blastdbcmd"]
missing_program_li = [x for x in required_program_li if shutil.which(x) == None]
if len(missing_program_li) > 0:
print("The following dependencies could not be found:", ", ".join(missing_program_li)+".", file=sys.stderr)
print("Please install and rerun.", file=sys.stderr)
else:
if minimal_stderr: # ignore affirmative sequence processing and error traceback
original_stderr = sys.stderr
sys.stderr = open(os.devnull, "w")
# check selected organism dependencies
for organism in input_organism_li:
if not sequence_annotate.prep.main(organism):
print("Removing "+organism+" from searchable organisms.", file=sys.stderr)
input_organism_li = [x for x in input_organism_li if x != organism]
fasta_di, sum_len = {}, len(read_fasta_li2[1])
if len(input_organism_li) != 1:
print("Identifying species for "+str(len(read_fasta_li2[1]))+" sequences", file=sys.stderr)
identify = species_identify(temp_dir)
fasta_di = identify.main(read_fasta_li2, input_organism_li)
sum_len = sum([len(fasta_di[x]) for x in fasta_di])
blast_complete_str = "Species identification complete"
if len(read_fasta_li2[1]) > sum_len:
blast_complete_str += " ("+str(len(read_fasta_li2[1])-sum_len)+" sequences could not be identified)"
print(blast_complete_str, file=sys.stderr)
else:
fasta_di[input_organism_li[0]] = [[index, sequence] for index, sequence in enumerate(read_fasta_li2[1])]
print("Annotating "+str(sum_len)+" sequences...", file=sys.stderr)
bar = Bar("Progress", fill='#', suffix='%(percent)d%%', max=sum_len) # track progress with bar visual
try:
# annotate by organism
for organism in fasta_di:
annotate = sequence_annotate.process(organism, temp_dir)
choose_match_li = [x for x in choose_match_li if x in annotate.total_cds]
if choose_match_li == []:
choose_match_li = annotate.total_cds
for di_li in fasta_di[organism]: # use original FASTA order
total_annotate_li2 = annotate.main(di_li[1], choose_match_li)
if len(total_annotate_li2) > 0:
annotate_li4[di_li[0]] = [organism, total_annotate_li2]
bar.next()
except:
traceback.print_exc(file=sys.stderr)
print(read_fasta_li2[0], file=sys.stderr) #?
bar.finish()
print("Annotation process complete", file=sys.stderr)
if minimal_stderr:
sys.stderr.close()
sys.stderr = original_stderr
#annotate argument
if args.process == "annotate":
output_gff3 = gff3.write(read_fasta_li2, annotate_li4, spar_path+"/required/")
if args.output != None:
write_file = open(args.output, "w")
write_file.write(output_gff3)
write_file.close()
else:
print(output_gff3)
#rflp argument
if args.process == "rflp":
rflp = prrsv_orf5_rflp(temp_dir)
print("Assigning RFLP patterns to "+str(len(annotate_li4))+" sequences...", file=sys.stderr)
bar = Bar("Progress", fill='#', suffix='%(percent)d%%', max=len(annotate_li4)) # track progress with bar visual
for fasta_index, annotate_li3 in enumerate(annotate_li4):
if len(annotate_li3) > 0 and annotate_li3[0] == "PRRSV2":
read_fasta_li2[0][fasta_index] += "/"+rflp.main(read_fasta_li2[1][fasta_index], annotate_li3[1], args.full)
else:
read_fasta_li2[0][fasta_index] += "/na" # invalid organism
bar.next()
bar.finish()
print("RFLP pattern assignment complete", file=sys.stderr)
if args.output != None and os.path.isfile(args.output):
fasta.write(args.output, read_fasta_li2)
else:
for fasta_index, head in enumerate(read_fasta_li2[0]):
print(">"+head)
print(read_fasta_li2[1][fasta_index])
# remove temporary directory
subprocess.call("rm -rf "+temp_dir, shell=True)
if __name__ == "__main__":
main()