-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathlong_tail_threshold.py
119 lines (105 loc) · 3.44 KB
/
long_tail_threshold.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
#!/usr/bin/env python
##########################################################################
#
# Copyright (C) 2015-2016 Sam Westreich
#
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation;
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
# See the GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#
##########################################################################
#
# long_tail_threshold.py
# Created 2/05/16, last edited 3/10/16
# Created by Sam Westreich, [email protected], github.com/transcript/
#
##########################################################################
#
# Purpose: there are a lot of mismatches in the long tail of annotations
# (below 0.05% of total annotations). This script will remove these from a
# trimmed output file.
#
# USAGE OPTIONS:
#
# -T Specifies cutoff percentage (0.0001-100) for thresholding, required
# -I Input file name, required
# -O Output file name, optional (default is infile.thresholded)
# -Q Quiet mode, optional
#
##########################################################################
import sys, os
# String searching function:
def string_find(usage_term):
    """Return the command-line value that follows a flag.

    Scans ``sys.argv`` for the first element that equals ``usage_term`` once
    the element is upper-cased, so a flag is matched case-insensitively as
    long as ``usage_term`` itself is passed in upper case (e.g. ``"-I"``).

    Returns the element immediately after the flag, or ``None`` when the flag
    is absent or is the last element (i.e. nothing follows it).
    """
    for idx, elem in enumerate(sys.argv):
        if elem.upper() != usage_term:
            continue
        # Do not wrap around to sys.argv[0]: a trailing flag has no value, and
        # handing back the script's own path would let a caller open (or even
        # overwrite) the script itself.
        if idx + 1 < len(sys.argv):
            return sys.argv[idx + 1]
        return None
    return None
# Command-line driver: parse and validate the flags, then copy every input
# line whose first tab-separated field is strictly greater than the threshold
# to the output file. Only single-argument print(...) calls are used so the
# script behaves the same under Python 2 and Python 3.

# pull ARGV
# Upper-cased command-line tokens (script name excluded). Flags are looked up
# as whole tokens, so a file name that merely contains "-o" or "-q" is not
# mistaken for a flag.
argv = [arg.upper() for arg in sys.argv[1:]]

# quiet mode
quiet = "-Q" in argv
if not quiet:
    print("\nCOMMAND USED:\t" + " ".join(sys.argv) + "\n")

# usage statement
if "-USAGE" in argv or "--USAGE" in argv:
    print("USAGE STATEMENT")
    print("-Q\tEnables quiet mode")
    print("-I\tSpecifies input file name, required")
    print("-T\tSpecifies threshold cutoff for percentage of total reads; maximum is 100, required")
    print("-O\tSpecifies output file name, optional")
    sys.exit()
elif not quiet:
    print("For usage options, run with flag '-usage'.")

# warning if input file or threshold % isn't specified
if "-I" not in argv:
    print("WARNING: No infile specified in ARGV (use '-I' flag). Terminating...")
    sys.exit()
if "-T" not in argv:
    print("WARNING: No threshold cutoff specified in ARGV; threshold should be between 0 and 100. Terminating...")
    sys.exit()

# infile
infile_name = string_find("-I")
if infile_name is None:
    print("WARNING: No infile name given after '-I'. Terminating...")
    sys.exit()
try:
    infile = open(infile_name, "r")
except IOError:
    # open() reports a missing or unreadable file as IOError (an alias of
    # OSError on Python 3); it never raises IndexError.
    print("WARNING: Cannot open infile!")
    sys.exit()

# From here on the input file is closed on every exit path, including the
# sys.exit() calls below (SystemExit propagates through the with-block).
with infile:
    # threshold percentage
    try:
        # Converted once here instead of once per input line.
        threshold = float(string_find("-T"))
    except (TypeError, ValueError):
        # TypeError: no value after '-T'; ValueError: value is not numeric.
        print("WARNING: Threshold value is not a number. Terminating...")
        sys.exit()
    # Written as a chained comparison so that NaN is rejected as well.
    if not 0 <= threshold <= 100:
        print("WARNING: Threshold value is not between 0 and 100. Terminating...")
        sys.exit()

    # outfile
    if "-O" in argv:
        outfile_name = string_find("-O")
        if outfile_name is None:
            print("WARNING: No outfile name given after '-O'. Terminating...")
            sys.exit()
        if not quiet:
            print("Output file name: " + outfile_name)
    else:
        outfile_name = infile_name + ".thresholded"
        if not quiet:
            print("Using standard name: " + outfile_name)

    # Opening the output in "w" mode truncates it, so refuse to write onto
    # the input file before it has been read.
    if os.path.abspath(outfile_name) == os.path.abspath(infile_name):
        print("WARNING: Output file is the same as the input file. Terminating...")
        sys.exit()
    try:
        outfile = open(outfile_name, "w")
    except IOError:
        print("WARNING: Cannot open outfile!")
        sys.exit()

    # executing!
    with outfile:
        for line in infile:
            # Blank lines carry no value to compare; skip them instead of
            # failing on float("").
            if not line.strip():
                continue
            # Keep only lines whose first tab-separated field is strictly
            # above the threshold; the line is written back unchanged.
            if float(line.split("\t")[0]) > threshold:
                outfile.write(line)

if not quiet:
    print("File processed: " + infile_name)