-
Notifications
You must be signed in to change notification settings - Fork 6
/
md5-R-results-query.py
210 lines (179 loc) · 5.83 KB
/
md5-R-results-query.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
#!/usr/bin/env Python
##########################################################################
#
# Copyright (C) 2015-2016 Sam Westreich
#
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation;
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
# See the GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#
##########################################################################
#
# md5-R-results-query.py
# Created 1/26/16, last edited 3/10/16
# Created by Sam Westreich, [email protected], github.com/transcript/
#
##########################################################################
#
# This should take a list of MG-RAST m5nr IDs (this one from R results table)
# and return the RefSeq accessions.
#
# USAGE OPTIONS:
#
# -I Input file name, required
# -O Output file name, optional (default is infile_extIDs.tab)
# --resume If the input file was partially annotated using this program, this
# command allows for resuming at the last location not annotated.
# Optional.
#
##########################################################################
import sys, os, time, subprocess, commands
# String searching function:
def string_find(usage_term):
for idx, elem in enumerate(sys.argv):
this_elem = elem
next_elem = sys.argv[(idx + 1) % len(sys.argv)]
if elem.upper() == usage_term:
return next_elem
# pull ARGV
argv = str(sys.argv).upper()
# quiet mode
if "-Q" in argv:
quiet = True
else:
quiet = False
print ("\nCOMMAND USED:\t" + " ".join(sys.argv) + "\n")
# usage statement
if "-usage" in sys.argv:
print "USAGE STATEMENT"
print "-I\tSpecifies input file name, required"
print "-O\tSpecifies output file name, optional"
print "--resume\tResumes annotating IDs to file that already started but was interrupted, optional"
sys.exit()
else:
if quiet == False:
print "For usage options, run with flag '-usage'."
# warning if input file or threshold % isn't specified
if "-I" not in argv:
print "WARNING: No infile specified in ARGV (use '-I' flag). Terminating..."
sys.exit()
# input file
input_file_name = string_find("-I")
try:
infile = open (input_file_name, "r")
except IndexError:
sys.exit ("\nERROR:\nNo infile specified as ARGV using '-I' flag.\n")
# output file
if "-O" in argv:
outfile_name = string_find("-O")
else:
split_input = input_file_name.split(".")
outfile_name = split_input[0] + "_extIDs.tab"
outfile_db = []
# opening the output file
if "--RESUME" in argv:
if quiet == False:
print "Outfile exists"
outfile = open (outfile_name, "r")
# read in outfile completed lines to restart a list already in progress
for line in outfile:
try:
splitline = line.split("\t")
outfile_db.append(splitline[0])
except IndexError:
continue
if quiet == False:
print "Outfile read in."
outfile.close() # need to close it to resume reading from the beginning
outfile = open (outfile_name, "w")
else:
outfile = open (outfile_name, "w")
# variables
md5_db = {}
m5nr_ID_db = {}
i = 0
error_count = 0
line_counter = 0
# base command values for building the CURL request
base_command_start = "curl -X GET http://api.metagenomics.anl.gov/1/m5nr/md5/"
base_command_end = "?source=RefSeq"
# timing
t0 = time.clock()
# executing
for line in infile:
line_counter += 1
if i == 0:
i += 1
outfile.write("\t" + line)
else:
splitline = line.strip().split("\t", 1)
md5_db[splitline[0]] = splitline[1]
i += 1
# to make sure the program's still running:
if i % 100 == 0:
if quiet == False:
print ("Running... " + str(i) + " lines processed so far.")
command = base_command_start + splitline[0] + base_command_end
# for resuming, checks if ID has already been processed in output file
if "-resume" in sys.argv:
if splitline[0] in outfile_db:
continue
# note: currently using 'commands', which is deprecated but works for Unix systems and python 2.x
result = commands.getstatusoutput(command)
# breaks up curl gibberish to get just the accession ID, saved in m5nr_ID_db
splitresult_0 = str(result).split('accession":"')
try:
splitresult_1 = splitresult_0[1].split('"')
except IndexError:
error_count += 1
splitresult_1 = "unknown"
# Let's also get the function out of this:
splitfunction_0 = str(result).split('function":"')
try:
splitfunction_1 = splitfunction_0[1].split('"')
except IndexError:
splitfunction_1 = "unknown"
# Need to change the "W" that starts these IDs to a Y, because RefSeq
# NOTE: Currently turned off; not apparently necessary
# if str(splitresult_1[0])[0] == "W":
# RefSeq_ID = list(splitresult_1[0])
# RefSeq_ID[0] = "Y"
# RefSeq_ID = "".join(RefSeq_ID)
# else:
# RefSeq_ID = str(splitresult_1[0])
# below is for testing, delete after
# if i < 25:
# print line
# if "flagellin" in line:
# print command
# print ("\n" + str(result) + "\n")
# print str(splitfunction_1[0])
# print str(splitresult_1[0])
# print RefSeq_ID
# else:
# sys.exit()
RefSeq_ID = str(splitresult_1)
m5nr_ID_db[splitline[1]] = RefSeq_ID
# writing to outfile
outfile.write(splitline[0] + "\t" + splitline[1] + "\t" + RefSeq_ID + "\n")
# ticker
if line_counter % 1000 == 0:
print str(line_counter) + " lines processed so far."
# timing
t1 = time.clock()
# end reports
if quiet == False:
print ("Number of lines processed:\t" + str(i))
print ("Number of errors\t\t" + str(error_count))
print ("Time elapsed:\t\t" + str(t1-t0) + " seconds")
infile.close()
outfile.close()