Skip to content

Commit

Permalink
multi-organism option for functional_search_by_organism
Browse files Browse the repository at this point in the history
  • Loading branch information
Sam Westreich authored and Sam Westreich committed Jan 22, 2018
1 parent 33988e2 commit a3f15e8
Showing 1 changed file with 34 additions and 21 deletions.
55 changes: 34 additions & 21 deletions functional_search_by_organism.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
##########################################################################
#
# functional_search_by_organism.py
# Created 5/10/16, last modified 5/31/16
# Created 5/10/16, last modified 1/22/18
# Created by Sam Westreich, [email protected], github.com/transcript/
#
##########################################################################
Expand All @@ -36,6 +36,7 @@
#
# Inputs required:
# -N <organism_name> The name of the organism you want to search for.
# -NX <organisms_file> The name of a file with multiple organisms listed inside.
# -O <organism_file> The RefSeq organism output file.
# -F <function_file> The RefSeq function output file.
# -R <results_file> The name given to the results file (outfile).
Expand Down Expand Up @@ -80,15 +81,17 @@ def file_len(fname):
print "USAGE STATEMENT"
print "-Q\tEnables quiet mode"
print "-N\tName of organism to search for"
print "-NX\tName of file of organisms to search for"
print "-O\tOrganism output file to be searched"
print "-F\tFunction output file to be searched"
print "-R\tOutput file (default is organism_name_results.tab)"
print "-I\tRemoval targets, list generated by long_tail_threshold.py, OPTIONAL"
sys.exit()

if "-N" not in argv:
if "-N " not in argv:
if "-I" not in argv:
sys.exit("Missing either:\n\t-N flag for organism of interest\n\t-I flag for list of organisms to remove")
if "-NX" not in argv:
sys.exit("Missing either:\n\t-N flag for organism of interest\n\t-I flag for list of organisms to remove")
if "-O" not in argv:
sys.exit("Missing -O flag for organism input file")
if "-F" not in argv:
Expand All @@ -108,7 +111,7 @@ def file_len(fname):
sys.exit("Unable to open organism MG-RAST output file.")

# Part 1, v1: Get all MG-RAST IDs associated with a single organism of choice.
if "-N" in argv:
if "-N " in argv:
org_name = string_find("-N")

# counters
Expand All @@ -128,27 +131,34 @@ def file_len(fname):
t1 = time.clock()

# Results of searching org infile
print ("Organism output searched. Time elapsed: " + str(t1-t0) + " seconds.")
print ("\nOrganism output searched. Time elapsed: " + str(t1-t0) + " seconds.")
print (str(len(ID_list)) + " IDs in list, " + str(hit_counter) + " total hits to the organism named " + org_name + ".")

org_infile.close()

# Part 1, v2: Get all MG-RAST IDs associated with all organisms from a file.
if "-I" in argv:
removal_targets_filename = string_find("-I")
if "-I" in argv or "-NX" in argv:
try:
if "-I" in argv:
removal_targets_filename = string_find("-I")
if "-NX" in argv:
removal_targets_filename = string_find("-NX")
removal_targets_file = open (removal_targets_filename, "r")
except IOError:
sys.exit ("Unable to open file of target organisms to be removed.")
sys.exit ("Unable to open file of target organisms.")

org_targets = []

for line in removal_targets_file:
splitline = line.split("\t")
if splitline[2] not in org_targets:
org_targets.append(splitline[2])
splitline = line.strip().split("\t")
if "-I" in argv:
if splitline[2] not in org_targets:
org_targets.append(splitline[2])
if "-NX" in argv:
if splitline[0] not in org_targets:
org_targets.append(splitline[0])
if quiet == False:
print ("Targets file indexed. " + str(len(org_targets)) + " target organisms will be screened against in the functions database.")
print ("\nTargets file indexed. " + str(len(org_targets)) + " target organisms will be screened against in the functions database.")

# counters
hit_counter = 0
Expand All @@ -168,13 +178,16 @@ def file_len(fname):
ID_list.append(splitline[1])
if org_line_counter % 100000 == 0:
if quiet == False:
print (str(float(org_line_counter)/float(org_file_length)*100) + "% completed\t-\t" + str(org_line_counter) + " lines processed so far for MG-RAST org file.")
print (str(float(org_line_counter)/float(org_file_length)*100) + "% completed\t\t-\t" + str(org_line_counter) + " lines processed so far for MG-RAST org file.")

t1 = time.clock()

# Results of searching org infile
print ("MG-RAST organism output searched. Time elapsed: " + str(t1-t0) + " seconds.")
print (str(len(ID_list)) + " IDs in list, " + str(hit_counter) + " total hits to all organisms from the removal file.")
if "-I" in argv:
print (str(len(set(ID_list))) + " IDs in list, " + str(hit_counter) + " total hits to all organisms from the removal file.")
if "-NX" in argv:
print (str(len(set(ID_list))) + " IDs in list, " + str(hit_counter) + " total hits to all organisms from the organisms file.")

org_infile.close()

Expand Down Expand Up @@ -219,7 +232,7 @@ def file_len(fname):
print (str(line_counter) + " lines processed so far in function file.")

t3 = time.clock()

# This part is for searching only for a single organism.
else:
# build a database of all func results
Expand All @@ -230,16 +243,16 @@ def file_len(fname):
try:
func_db[splitline[1]] = splitline[3]
except IndexError:
# print line
continue

print "Functional database of all entries successfully assembled."
print "\nFunctional database of all entries successfully assembled."

# matching to results from org file
final_db = {}
tally_db = {}
error_counter = 0
process_counter = 0
error_list = []

t2 = time.clock()

Expand All @@ -254,8 +267,8 @@ def file_len(fname):
continue
except KeyError:
error_counter += 1
error_list.append(org_ID)
continue
# print ("Not matched in functional database:\t" + org_ID)
if process_counter % 10000 == 0:
if quiet == False:
print (str(process_counter) + " lines processed so far in function file.")
Expand All @@ -266,7 +279,7 @@ def file_len(fname):
if "-R" in argv:
outfile_name = string_find("-R")
else:
if "-N" in argv:
if "-N" in argv or "-NX" in argv:
outfile_name = org_name + "_results.tab"
else:
outfile_name = func_infile_name[:-4] + ".thresholded.tab"
Expand All @@ -276,11 +289,11 @@ def file_len(fname):
for k, v in sorted(tally_db.items(), key=lambda (k,v): -v):
outfile.write ( str(v) + "\t" + str(final_db[k]).strip() + "\t" + k + "\n")

print ("All matches searched. Time elapsed: " + str(t3-t2) + " seconds.")
print ("\nAll matches searched. Time elapsed: " + str(t3-t2) + " seconds.")
if "-I" in argv:
print ("Number of functional annotations removed: " + str(removed_counter))
else:
print ("Number of different functional annotations found for organism " + org_name + ": " + str(len(set(final_db.values()))))

print ("Error rate: " + str(error_counter) + " errors out of " + str(len(ID_list)) + " total annotations.")
print ("Error rate: " + str(error_counter) + " errors out of " + str(len(ID_list)) + " total annotations, " + str(len(set(error_list))) + " unique transcripts not matched in functions.")

0 comments on commit a3f15e8

Please sign in to comment.