-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
multi-organism option for functional_search_by_organism
- Loading branch information
Sam Westreich
authored and
Sam Westreich
committed
Jan 22, 2018
1 parent
33988e2
commit a3f15e8
Showing
1 changed file
with
34 additions
and
21 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -20,7 +20,7 @@ | |
########################################################################## | ||
# | ||
# functional_search_by_organism.py | ||
# Created 5/10/16, last modified 5/31/16 | ||
# Created 5/10/16, last modified 1/22/18 | ||
# Created by Sam Westreich, [email protected], github.com/transcript/ | ||
# | ||
########################################################################## | ||
|
@@ -36,6 +36,7 @@ | |
# | ||
# Inputs required: | ||
# -N <organism_name> The name of the organism you want to search for. | ||
# -NX <organisms_file> The name of a file with multiple organisms listed inside. | ||
# -O <organism_file> The RefSeq organism output file. | ||
# -F <function_file> The RefSeq function output file. | ||
# -R <results_file> The name given to the results file (outfile). | ||
|
@@ -80,15 +81,17 @@ def file_len(fname): | |
print "USAGE STATEMENT" | ||
print "-Q\tEnables quiet mode" | ||
print "-N\tName of organism to search for" | ||
print "-NX\tName of file of organisms to search for" | ||
print "-O\tOrganism output file to be searched" | ||
print "-F\tFunction output file to be searched" | ||
print "-R\tOutput file (default is organism_name_results.tab)" | ||
print "-I\tRemoval targets, list generated by long_tail_threshold.py, OPTIONAL" | ||
sys.exit() | ||
|
||
if "-N" not in argv: | ||
if "-N " not in argv: | ||
if "-I" not in argv: | ||
sys.exit("Missing either:\n\t-N flag for organism of interest\n\t-I flag for list of organisms to remove") | ||
if "-NX" not in argv: | ||
sys.exit("Missing either:\n\t-N flag for organism of interest\n\t-I flag for list of organisms to remove") | ||
if "-O" not in argv: | ||
sys.exit("Missing -O flag for organism input file") | ||
if "-F" not in argv: | ||
|
@@ -108,7 +111,7 @@ def file_len(fname): | |
sys.exit("Unable to open organism MG-RAST output file.") | ||
|
||
# Part 1, v1: Get all MG-RAST IDs associated with a single organism of choice. | ||
if "-N" in argv: | ||
if "-N " in argv: | ||
org_name = string_find("-N") | ||
|
||
# counters | ||
|
@@ -128,27 +131,34 @@ def file_len(fname): | |
t1 = time.clock() | ||
|
||
# Results of searching org infile | ||
print ("Organism output searched. Time elapsed: " + str(t1-t0) + " seconds.") | ||
print ("\nOrganism output searched. Time elapsed: " + str(t1-t0) + " seconds.") | ||
print (str(len(ID_list)) + " IDs in list, " + str(hit_counter) + " total hits to the organism named " + org_name + ".") | ||
|
||
org_infile.close() | ||
|
||
# Part 1, v2: Get all MG-RAST IDs associated with all organisms from a file. | ||
if "-I" in argv: | ||
removal_targets_filename = string_find("-I") | ||
if "-I" in argv or "-NX" in argv: | ||
try: | ||
if "-I" in argv: | ||
removal_targets_filename = string_find("-I") | ||
if "-NX" in argv: | ||
removal_targets_filename = string_find("-NX") | ||
removal_targets_file = open (removal_targets_filename, "r") | ||
except IOError: | ||
sys.exit ("Unable to open file of target organisms to be removed.") | ||
sys.exit ("Unable to open file of target organisms.") | ||
|
||
org_targets = [] | ||
|
||
for line in removal_targets_file: | ||
splitline = line.split("\t") | ||
if splitline[2] not in org_targets: | ||
org_targets.append(splitline[2]) | ||
splitline = line.strip().split("\t") | ||
if "-I" in argv: | ||
if splitline[2] not in org_targets: | ||
org_targets.append(splitline[2]) | ||
if "-NX" in argv: | ||
if splitline[0] not in org_targets: | ||
org_targets.append(splitline[0]) | ||
if quiet == False: | ||
print ("Targets file indexed. " + str(len(org_targets)) + " target organisms will be screened against in the functions database.") | ||
print ("\nTargets file indexed. " + str(len(org_targets)) + " target organisms will be screened against in the functions database.") | ||
|
||
# counters | ||
hit_counter = 0 | ||
|
@@ -168,13 +178,16 @@ def file_len(fname): | |
ID_list.append(splitline[1]) | ||
if org_line_counter % 100000 == 0: | ||
if quiet == False: | ||
print (str(float(org_line_counter)/float(org_file_length)*100) + "% completed\t-\t" + str(org_line_counter) + " lines processed so far for MG-RAST org file.") | ||
print (str(float(org_line_counter)/float(org_file_length)*100) + "% completed\t\t-\t" + str(org_line_counter) + " lines processed so far for MG-RAST org file.") | ||
|
||
t1 = time.clock() | ||
|
||
# Results of searching org infile | ||
print ("MG-RAST organism output searched. Time elapsed: " + str(t1-t0) + " seconds.") | ||
print (str(len(ID_list)) + " IDs in list, " + str(hit_counter) + " total hits to all organisms from the removal file.") | ||
if "-I" in argv: | ||
print (str(len(set(ID_list))) + " IDs in list, " + str(hit_counter) + " total hits to all organisms from the removal file.") | ||
if "-NX" in argv: | ||
print (str(len(set(ID_list))) + " IDs in list, " + str(hit_counter) + " total hits to all organisms from the organisms file.") | ||
|
||
org_infile.close() | ||
|
||
|
@@ -219,7 +232,7 @@ def file_len(fname): | |
print (str(line_counter) + " lines processed so far in function file.") | ||
|
||
t3 = time.clock() | ||
|
||
# This part is for searching only for a single organism. | ||
else: | ||
# build a database of all func results | ||
|
@@ -230,16 +243,16 @@ def file_len(fname): | |
try: | ||
func_db[splitline[1]] = splitline[3] | ||
except IndexError: | ||
# print line | ||
continue | ||
|
||
print "Functional database of all entries successfully assembled." | ||
print "\nFunctional database of all entries successfully assembled." | ||
|
||
# matching to results from org file | ||
final_db = {} | ||
tally_db = {} | ||
error_counter = 0 | ||
process_counter = 0 | ||
error_list = [] | ||
|
||
t2 = time.clock() | ||
|
||
|
@@ -254,8 +267,8 @@ def file_len(fname): | |
continue | ||
except KeyError: | ||
error_counter += 1 | ||
error_list.append(org_ID) | ||
continue | ||
# print ("Not matched in functional database:\t" + org_ID) | ||
if process_counter % 10000 == 0: | ||
if quiet == False: | ||
print (str(process_counter) + " lines processed so far in function file.") | ||
|
@@ -266,7 +279,7 @@ def file_len(fname): | |
if "-R" in argv: | ||
outfile_name = string_find("-R") | ||
else: | ||
if "-N" in argv: | ||
if "-N" in argv or "-NX" in argv: | ||
outfile_name = org_name + "_results.tab" | ||
else: | ||
outfile_name = func_infile_name[:-4] + ".thresholded.tab" | ||
|
@@ -276,11 +289,11 @@ def file_len(fname): | |
for k, v in sorted(tally_db.items(), key=lambda (k,v): -v): | ||
outfile.write ( str(v) + "\t" + str(final_db[k]).strip() + "\t" + k + "\n") | ||
|
||
print ("All matches searched. Time elapsed: " + str(t3-t2) + " seconds.") | ||
print ("\nAll matches searched. Time elapsed: " + str(t3-t2) + " seconds.") | ||
if "-I" in argv: | ||
print ("Number of functional annotations removed: " + str(removed_counter)) | ||
else: | ||
print ("Number of different functional annotations found for organism " + org_name + ": " + str(len(set(final_db.values())))) | ||
|
||
print ("Error rate: " + str(error_counter) + " errors out of " + str(len(ID_list)) + " total annotations.") | ||
print ("Error rate: " + str(error_counter) + " errors out of " + str(len(ID_list)) + " total annotations, " + str(len(set(error_list))) + " unique transcripts not matched in functions.") | ||
|