multi-organism option for functional_search_by_organism

transcript · Jan 22, 2018 · a3f15e8 · a3f15e8
1 parent 33988e2
commit a3f15e8
Showing 1 changed file with 34 additions and 21 deletions.
diff --git a/functional_search_by_organism.py b/functional_search_by_organism.py
@@ -20,7 +20,7 @@
 ##########################################################################
 #
 # functional_search_by_organism.py
-# Created 5/10/16, last modified 5/31/16
+# Created 5/10/16, last modified 1/22/18
 # Created by Sam Westreich, [email protected], github.com/transcript/
 #
 ##########################################################################
@@ -36,6 +36,7 @@
 # 
 # Inputs required:
 # -N <organism_name>		The name of the organism you want to search for.
+# -NX <organisms_file>		The name of a file with multiple organisms listed inside.
 # -O <organism_file>		The RefSeq organism output file.
 # -F <function_file> 		The RefSeq function output file.
 # -R <results_file>			The name given to the results file (outfile).
@@ -80,15 +81,17 @@ def file_len(fname):
 	print "USAGE STATEMENT"
 	print "-Q\tEnables quiet mode"
 	print "-N\tName of organism to search for"
+	print "-NX\tName of file of organisms to search for"
 	print "-O\tOrganism output file to be searched"
 	print "-F\tFunction output file to be searched"
 	print "-R\tOutput file (default is organism_name_results.tab)"
 	print "-I\tRemoval targets, list generated by long_tail_threshold.py, OPTIONAL"
 	sys.exit()
 
-if "-N" not in argv:
+if "-N " not in argv:
 	if "-I" not in argv:
-		sys.exit("Missing either:\n\t-N flag for organism of interest\n\t-I flag for list of organisms to remove")
+		if "-NX" not in argv:
+			sys.exit("Missing either:\n\t-N flag for organism of interest\n\t-I flag for list of organisms to remove")
 if "-O" not in argv:
 	sys.exit("Missing -O flag for organism input file")
 if "-F" not in argv:
@@ -108,7 +111,7 @@ def file_len(fname):
 	sys.exit("Unable to open organism MG-RAST output file.")
 
 # Part 1, v1: Get all MG-RAST IDs associated with a single organism of choice.
-if "-N" in argv:
+if "-N " in argv:
 	org_name = string_find("-N")
 
 	# counters
@@ -128,27 +131,34 @@ def file_len(fname):
 	t1 = time.clock()
 
 	# Results of searching org infile
-	print ("Organism output searched.  Time elapsed: " + str(t1-t0) + " seconds.")
+	print ("\nOrganism output searched.  Time elapsed: " + str(t1-t0) + " seconds.")
 	print (str(len(ID_list)) + " IDs in list, " + str(hit_counter) + " total hits to the organism named " + org_name + ".")
 
 	org_infile.close()
 
 # Part 1, v2: Get all MG-RAST IDs associated with all organisms from a file.
-if "-I" in argv:
-	removal_targets_filename = string_find("-I")
+if "-I" in argv or "-NX" in argv:
 	try:
+		if "-I" in argv:
+			removal_targets_filename = string_find("-I")
+		if "-NX" in argv:
+			removal_targets_filename = string_find("-NX")
 		removal_targets_file = open (removal_targets_filename, "r")
 	except IOError:
-		sys.exit ("Unable to open file of target organisms to be removed.")
+		sys.exit ("Unable to open file of target organisms.")
 
 	org_targets = []
 
 	for line in removal_targets_file:
-		splitline = line.split("\t")
-		if splitline[2] not in org_targets:
-			org_targets.append(splitline[2])
+		splitline = line.strip().split("\t")
+		if "-I" in argv:
+			if splitline[2] not in org_targets:
+				org_targets.append(splitline[2])
+		if "-NX" in argv:
+			if splitline[0] not in org_targets:
+				org_targets.append(splitline[0])
 	if quiet == False:
-		print ("Targets file indexed.  " + str(len(org_targets)) + " target organisms will be screened against in the functions database.")
+		print ("\nTargets file indexed.  " + str(len(org_targets)) + " target organisms will be screened against in the functions database.")
 
 	# counters
 	hit_counter = 0
@@ -168,13 +178,16 @@ def file_len(fname):
 				ID_list.append(splitline[1])
 		if org_line_counter % 100000 == 0:
 			if quiet == False:
-				print (str(float(org_line_counter)/float(org_file_length)*100) + "% completed\t-\t" +  str(org_line_counter) + " lines processed so far for MG-RAST org file.")
+				print (str(float(org_line_counter)/float(org_file_length)*100) + "% completed\t\t-\t" +  str(org_line_counter) + " lines processed so far for MG-RAST org file.")
 
 	t1 = time.clock()
 
 	# Results of searching org infile
 	print ("MG-RAST organism output searched.  Time elapsed: " + str(t1-t0) + " seconds.")
-	print (str(len(ID_list)) + " IDs in list, " + str(hit_counter) + " total hits to all organisms from the removal file.")
+	if "-I" in argv:
+		print (str(len(set(ID_list))) + " IDs in list, " + str(hit_counter) + " total hits to all organisms from the removal file.")
+	if "-NX" in argv:
+		print (str(len(set(ID_list))) + " IDs in list, " + str(hit_counter) + " total hits to all organisms from the organisms file.")
 
 	org_infile.close()
 
@@ -219,7 +232,7 @@ def file_len(fname):
 				print (str(line_counter) + " lines processed so far in function file.")
 
 	t3 = time.clock()
-				
+
 # This part is for searching only for a single organism.
 else:
 	# build a database of all func results
@@ -230,16 +243,16 @@ def file_len(fname):
 		try:
 			func_db[splitline[1]] = splitline[3]
 		except IndexError:
-	#		print line
 			continue
 
-	print "Functional database of all entries successfully assembled."
+	print "\nFunctional database of all entries successfully assembled."
 
 	# matching to results from org file
 	final_db = {}
 	tally_db = {}
 	error_counter = 0
 	process_counter = 0
+	error_list = []
 
 	t2 = time.clock()
 
@@ -254,8 +267,8 @@ def file_len(fname):
 				continue
 		except KeyError:
 			error_counter += 1
+			error_list.append(org_ID)
 			continue
-	#			print ("Not matched in functional database:\t" + org_ID)
 		if process_counter % 10000 == 0:
 			if quiet == False:
 				print (str(process_counter) + " lines processed so far in function file.")
@@ -266,7 +279,7 @@ def file_len(fname):
 if "-R" in argv:
 	outfile_name = string_find("-R")
 else:
-	if "-N" in argv:
+	if "-N" in argv or "-NX" in argv:
 		outfile_name = org_name + "_results.tab"
 	else:
 		outfile_name = func_infile_name[:-4] + ".thresholded.tab"
@@ -276,11 +289,11 @@ def file_len(fname):
 for k, v in sorted(tally_db.items(), key=lambda (k,v): -v):
 	outfile.write ( str(v) + "\t" + str(final_db[k]).strip() + "\t" + k + "\n") 
 
-print ("All matches searched.  Time elapsed: " + str(t3-t2) + " seconds.")
+print ("\nAll matches searched.  Time elapsed: " + str(t3-t2) + " seconds.")
 if "-I" in argv:
 	print ("Number of functional annotations removed: " + str(removed_counter))
 else:
 	print ("Number of different functional annotations found for organism " + org_name + ": " + str(len(set(final_db.values()))))
 
-print ("Error rate: " + str(error_counter) + " errors out of " + str(len(ID_list)) + " total annotations.")
+print ("Error rate: " + str(error_counter) + " errors out of " + str(len(ID_list)) + " total annotations, " + str(len(set(error_list))) + " unique transcripts not matched in functions.")